pd...

tallison Tue, 03 Mar 2015 10:53:42 -0800

Author: tallison
Date: Tue Mar  3 18:51:41 2015
New Revision: 1663764

URL: http://svn.apache.org/r1663764
Log:
TIKA-1489 add optional accessibility checking to PDF files


Added:
    
tika/trunk/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java
    
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java
    
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_empty.pdf
    
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_user.pdf
    
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_empty.pdf
    
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_user.pdf
Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
    
tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java

Added: 
tika/trunk/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java?rev=1663764&view=auto
==============================================================================
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java
 (added)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java
 Tue Mar  3 18:51:41 2015
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.exception;
+
+/**
+ * Signals that a document's access permissions forbid extracting its content.
+ * <p>
+ * At the time of writing, PDF is the only document type that might cause
+ * this exception.
+ */
+public class AccessPermissionException extends TikaException {
+
+    /** Message used by the constructors that are not handed one explicitly. */
+    private static final String DEFAULT_MESSAGE =
+            "Unable to process: content extraction is not allowed";
+
+    /**
+     * Creates the exception with the default message.
+     */
+    public AccessPermissionException() {
+        this(DEFAULT_MESSAGE);
+    }
+
+    /**
+     * Creates the exception with the default message and an underlying cause.
+     *
+     * @param th the underlying cause
+     */
+    public AccessPermissionException(Throwable th) {
+        this(DEFAULT_MESSAGE, th);
+    }
+
+    /**
+     * Creates the exception with a caller-supplied message.
+     *
+     * @param info message describing the refusal
+     */
+    public AccessPermissionException(String info) {
+        super(info);
+    }
+
+    /**
+     * Creates the exception with a caller-supplied message and a cause.
+     *
+     * @param info message describing the refusal
+     * @param th the underlying cause
+     */
+    public AccessPermissionException(String info, Throwable th) {
+        super(info, th);
+    }
+}

Added: 
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java?rev=1663764&view=auto
==============================================================================
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java
 (added)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java
 Tue Mar  3 18:51:41 2015
@@ -0,0 +1,71 @@
+package org.apache.tika.metadata;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Metadata keys describing what a reader is permitted to do with a document.
+ * <p>
+ * No common standard has been identified yet, so these keys were mostly
+ * derived from PDFBox's AccessPermission. Some of them, especially
+ * {@link #CAN_MODIFY} and {@link #FILL_IN_FORM}, can apply to other
+ * document formats as well.
+ */
+public interface AccessPermissions {
+
+    /** Namespace prefix shared by every key declared in this interface. */
+    String PREFIX = "access_permission" + Metadata.NAMESPACE_PREFIX_DELIMITER;
+
+    /**
+     * Whether any modifications can be made to the document.
+     * Declared as a text bag, unlike the other keys in this interface.
+     */
+    Property CAN_MODIFY = Property.externalTextBag(PREFIX + "can_modify");
+
+    /**
+     * Whether content should be extracted, generally.
+     */
+    Property EXTRACT_CONTENT = Property.externalText(PREFIX + "extract_content");
+
+    /**
+     * Whether content should be extracted for the purposes of accessibility.
+     */
+    Property EXTRACT_FOR_ACCESSIBILITY = Property.externalText(PREFIX + "extract_for_accessibility");
+
+    /**
+     * Whether the user can insert, rotate or delete pages.
+     */
+    Property ASSEMBLE_DOCUMENT = Property.externalText(PREFIX + "assemble_document");
+
+    /**
+     * Whether the user can fill in a form.
+     */
+    Property FILL_IN_FORM = Property.externalText(PREFIX + "fill_in_form");
+
+    /**
+     * Whether the user can modify annotations.
+     */
+    Property CAN_MODIFY_ANNOTATIONS = Property.externalText(PREFIX + "modify_annotations");
+
+    /**
+     * Whether the user can print the document.
+     */
+    Property CAN_PRINT = Property.externalText(PREFIX + "can_print");
+
+    /**
+     * Whether the user can print an image-degraded version of the document.
+     */
+    Property CAN_PRINT_DEGRADED = Property.externalText(PREFIX + "can_print_degraded");
+
+}

Added: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java?rev=1663764&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
 (added)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
 Tue Mar  3 18:51:41 2015
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.pdf;
+
+import java.io.Serializable;
+
+import org.apache.tika.exception.AccessPermissionException;
+import org.apache.tika.metadata.AccessPermissions;
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Checks whether or not a document allows extraction generally
+ * or extraction for accessibility only.
+ */
+public class AccessChecker implements Serializable {
+
+    private static final long serialVersionUID = 6492570218190936986L;
+
+    private final boolean needToCheck;
+    private final boolean allowAccessibility;
+
+    /**
+     * This constructs an {@link AccessChecker} that
+     * will not perform any checking and will always return without
+     * throwing an exception.
+     * <p>
+     * This constructor is available to allow for Tika's legacy ( <= v1.7) 
behavior.
+     */
+    public AccessChecker() {
+        needToCheck = false;
+        allowAccessibility = true;
+    }
+    /**
+     * This constructs an {@link AccessChecker} that will check
+     * for whether or not content should be extracted from a document.
+     *
+     * @param allowExtractionForAccessibility if general extraction is not 
allowed, is extraction for accessibility allowed
+     */
+    public AccessChecker(boolean allowExtractionForAccessibility) {
+        needToCheck = true;
+        this.allowAccessibility = allowExtractionForAccessibility;
+    }
+
+    /**
+     * Checks to see if a document's content should be extracted based
+     * on metadata values and the value of {@link #allowAccessibility} in the 
constructor.
+     *
+     * @param metadata
+     * @throws AccessPermissionException if access is not permitted
+     */
+    public void check(Metadata metadata) throws AccessPermissionException {
+        if (!needToCheck) {
+            return;
+        }
+        if ("false".equals(metadata.get(AccessPermissions.EXTRACT_CONTENT))) {
+            if (allowAccessibility) {
+                
if("true".equals(metadata.get(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY))) {
+                    return;
+                }
+                throw new AccessPermissionException("Content extraction for 
accessibility is not allowed.");
+            }
+            throw new AccessPermissionException("Content extraction is not 
allowed.");
+        }
+    }
+}

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1663764&r1=1663763&r2=1663764&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java 
(original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java 
Tue Mar  3 18:51:41 2015
@@ -39,6 +39,7 @@ import org.apache.pdfbox.io.RandomAccess
 import org.apache.pdfbox.io.RandomAccessFile;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDDocumentInformation;
+import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
 import org.apache.pdfbox.pdmodel.font.PDFont;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
@@ -46,6 +47,7 @@ import org.apache.tika.extractor.Embedde
 import org.apache.tika.io.CloseShieldInputStream;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.AccessPermissions;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.PagedText;
 import org.apache.tika.metadata.Property;
@@ -140,6 +142,9 @@ public class PDFParser extends AbstractP
 
             metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
             extractMetadata(pdfDocument, metadata);
+
+            AccessChecker checker = localConfig.getAccessChecker();
+            checker.check(metadata);
             if (handler != null) {
                 PDF2XHTML.process(pdfDocument, handler, context, metadata, 
localConfig);
             }
@@ -191,6 +196,28 @@ public class PDFParser extends AbstractP
     private void extractMetadata(PDDocument document, Metadata metadata)
             throws TikaException {
 
+        //first extract AccessPermissions
+        AccessPermission ap = document.getCurrentAccessPermission();
+        metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
+                Boolean.toString(ap.canExtractForAccessibility()));
+        metadata.set(AccessPermissions.EXTRACT_CONTENT,
+                Boolean.toString(ap.canExtractContent()));
+        metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT,
+                Boolean.toString(ap.canAssembleDocument()));
+        metadata.set(AccessPermissions.FILL_IN_FORM,
+                Boolean.toString(ap.canFillInForm()));
+        metadata.set(AccessPermissions.CAN_MODIFY,
+                Boolean.toString(ap.canModify()));
+        metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS,
+                Boolean.toString(ap.canModifyAnnotations()));
+        metadata.set(AccessPermissions.CAN_PRINT,
+                Boolean.toString(ap.canPrint()));
+        metadata.set(AccessPermissions.CAN_PRINT_DEGRADED,
+                Boolean.toString(ap.canPrintDegraded()));
+
+
+
+        //now go for the XMP stuff
         org.apache.jempbox.xmp.XMPMetadata xmp = null;
         XMPSchemaDublinCore dcSchema = null;
         try{

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java?rev=1663764&r1=1663763&r2=1663764&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
 Tue Mar  3 18:51:41 2015
@@ -14,20 +14,20 @@ package org.apache.tika.parser.pdf;
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.Serializable;
-import java.util.Locale;
-import java.util.Properties;
-
-import org.apache.pdfbox.util.PDFTextStripper;
-
-/**
- * Config for PDFParser.
- * 
+ * limitations under the License.
+ */
+
+import org.apache.pdfbox.util.PDFTextStripper;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Serializable;
+import java.util.Locale;
+import java.util.Properties;
+
+/**
+ * Config for PDFParser.
+ * 
  * This allows parameters to be set programmatically:
  * <ol>
  * <li>Calls to PDFParser, i.e. 
parser.getPDFParserConfig().setEnableAutoSpace() (as before)</li>
@@ -77,12 +77,14 @@ public class PDFParserConfig implements
     //The character width-based tolerance value used to estimate where spaces 
in text should be added
     private Float averageCharTolerance;
     
-    //The space width-based tolerance value used to estimate where spaces in 
text should be added
-    private Float spacingTolerance;
-
-    public PDFParserConfig() {
-        init(this.getClass().getResourceAsStream("PDFParser.properties"));
-    }
+    //The space width-based tolerance value used to estimate where spaces in 
text should be added
+    private Float spacingTolerance;
+
+    private AccessChecker accessChecker;
+
+    public PDFParserConfig() {
+        init(this.getClass().getResourceAsStream("PDFParser.properties"));
+    }
 
     /**
      * Loads properties from InputStream and then tries to close InputStream.
@@ -134,13 +136,24 @@ public class PDFParserConfig implements
         setExtractInlineImages(
                 getProp(props.getProperty("extractInlineImages"),
                 getExtractInlineImages()));
-        setExtractUniqueInlineImagesOnly(
-                getProp(props.getProperty("extractUniqueInlineImagesOnly"),
-                getExtractUniqueInlineImagesOnly()));
-    }
-    
-    /**
-     * Configures the given pdf2XHTML.
+        setExtractUniqueInlineImagesOnly(
+                getProp(props.getProperty("extractUniqueInlineImagesOnly"),
+                getExtractUniqueInlineImagesOnly()));
+
+        boolean checkExtractAccessPermission = 
getProp(props.getProperty("checkExtractAccessPermission"), false);
+        boolean allowExtractionForAccessibility = 
getProp(props.getProperty("allowExtractionForAccessibility"), true);
+
+        if (checkExtractAccessPermission == false) {
+            //silently ignore the crazy configuration of 
checkExtractAccessPermission = false,
+            //but allowExtractionForAccessibility=false
+            accessChecker = new AccessChecker();
+        } else {
+            accessChecker = new AccessChecker(allowExtractionForAccessibility);
+        }
+    }
+    
+    /**
+     * Configures the given pdf2XHTML.
      * 
      * @param pdf2XHTML
      */
@@ -329,12 +342,20 @@ public class PDFParserConfig implements
 
     /**
      * See {@link PDFTextStripper#setSpacingTolerance(float)}
-     */
-    public void setSpacingTolerance(Float spacingTolerance) {
-        this.spacingTolerance = spacingTolerance;
-    }
-
-    private boolean getProp(String p, boolean defaultMissing){
+     */
+    public void setSpacingTolerance(Float spacingTolerance) {
+        this.spacingTolerance = spacingTolerance;
+    }
+
+    /**
+     * Sets the checker that is consulted to decide whether a document's
+     * access permissions allow its content to be extracted.
+     *
+     * @param accessChecker checker to use; stored as given, without a null check
+     */
+    public void setAccessChecker(AccessChecker accessChecker) {
+        this.accessChecker = accessChecker;
+    }
+
+    /**
+     * @return the currently configured {@link AccessChecker}
+     */
+    public AccessChecker getAccessChecker() {
+        return accessChecker;
+    }
+
+    private boolean getProp(String p, boolean defaultMissing){
         if (p == null){
             return defaultMissing;
         }

Modified: 
tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties?rev=1663764&r1=1663763&r2=1663764&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
 (original)
+++ 
tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
 Tue Mar  3 18:51:41 2015
@@ -18,6 +18,8 @@ extractAnnotationText true
 sortByPosition false
 suppressDuplicateOverlappingText       false
 useNonSequentialParser false
-extractAcroFormContent true
-extractInlineImages false
-extractUniqueInlineImagesOnly true
+extractAcroFormContent true
+extractInlineImages false
+extractUniqueInlineImagesOnly true
+checkExtractAccessPermission false
+allowExtractionForAccessibility true

Added: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java?rev=1663764&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java
 (added)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java
 Tue Mar  3 18:51:41 2015
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.exception.AccessPermissionException;
+import org.apache.tika.metadata.AccessPermissions;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PropertyTypeException;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link AccessChecker}, driven by hand-built metadata rather
+ * than by real documents.
+ */
+public class AccessCheckerTest {
+
+    /**
+     * The no-arg (legacy) checker must never throw, whatever the metadata says.
+     * An exception from any call fails the test.
+     */
+    @Test
+    public void testLegacy() throws AccessPermissionException {
+        //legacy behavior; don't bother checking
+        AccessChecker checker = new AccessChecker();
+        checker.check(getMetadata(false, false));
+        checker.check(getMetadata(false, true));
+        checker.check(getMetadata(true, true));
+    }
+
+    /**
+     * A checker that does not honor the accessibility permission must refuse
+     * every document that forbids general extraction.
+     */
+    @Test
+    public void testNoExtraction() {
+        //allow nothing
+        AccessChecker checker = new AccessChecker(false);
+
+        assertTrue("correct exception with no extraction, no extract for accessibility",
+                isRefused(checker, getMetadata(false, false)));
+
+        //document allows extraction for accessibility,
+        //but application is not an accessibility application
+        assertTrue("correct exception with no extraction, document allows extract for accessibility",
+                isRefused(checker, getMetadata(false, true)));
+    }
+
+    /**
+     * A checker that honors the accessibility permission accepts a document
+     * that grants it and refuses one that does not.
+     */
+    @Test
+    public void testExtractOnlyForAccessibility() throws AccessPermissionException {
+        //allow accessibility
+        AccessChecker checker = new AccessChecker(true);
+
+        //must not throw
+        checker.check(getMetadata(false, true));
+
+        assertTrue("correct exception", isRefused(checker, getMetadata(false, false)));
+    }
+
+    /**
+     * If general extraction is allowed, the accessibility permission must not
+     * be consulted at all. An exception from either call fails the test.
+     */
+    @Test
+    public void testCrazyExtractNotForAccessibility() throws AccessPermissionException {
+        Metadata m = getMetadata(true, false);
+
+        //allow accessibility
+        new AccessChecker(true).check(m);
+
+        //don't extract for accessibility
+        new AccessChecker(false).check(m);
+    }
+
+    /**
+     * The two permission properties consulted by the checker are single-valued:
+     * adding a second value must be rejected.
+     */
+    @Test
+    public void testCantAddMultiplesToMetadata() {
+        Metadata m = new Metadata();
+        boolean ex = false;
+        m.add(AccessPermissions.EXTRACT_CONTENT, "true");
+        try {
+            m.add(AccessPermissions.EXTRACT_CONTENT, "false");
+        } catch (PropertyTypeException e) {
+            ex = true;
+        }
+        assertTrue("can't add multiple values", ex);
+
+        m = new Metadata();
+        ex = false;
+        m.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, "true");
+        try {
+            m.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, "false");
+        } catch (PropertyTypeException e) {
+            ex = true;
+        }
+        assertTrue("can't add multiple values", ex);
+    }
+
+    /**
+     * Runs the checker and reports whether it refused access.
+     *
+     * @return true if {@link AccessChecker#check(Metadata)} threw an
+     *         {@link AccessPermissionException}, false if it returned normally
+     */
+    private boolean isRefused(AccessChecker checker, Metadata m) {
+        try {
+            checker.check(m);
+            return false;
+        } catch (AccessPermissionException e) {
+            return true;
+        }
+    }
+
+    /**
+     * Builds metadata carrying only the two permissions the checker reads.
+     */
+    private Metadata getMetadata(boolean allowExtraction, boolean allowExtractionForAccessibility) {
+        Metadata m = new Metadata();
+        m.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(allowExtraction));
+        m.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, Boolean.toString(allowExtractionForAccessibility));
+        return m;
+    }
+}

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1663764&r1=1663763&r2=1663764&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
 Tue Mar  3 18:51:41 2015
@@ -32,6 +32,7 @@ import java.util.Map;
 import java.util.Set;
 
 import org.apache.tika.TikaTest;
+import org.apache.tika.exception.AccessPermissionException;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.ContainerExtractor;
@@ -642,17 +643,20 @@ public class PDFParserTest extends TikaT
                 continue;
             }
 
-            pdfs++;
-            
             String sequentialContent = null;
             Metadata sequentialMetadata = new Metadata();
             try {
                 sequentialContent = getText(new FileInputStream(f), 
                         sequentialParser, seqContext, sequentialMetadata);
+            } catch (EncryptedDocumentException e) {
+                //silently skip a file that requires a user password
+                continue;
             } catch (Exception e) {
                 throw new TikaException("Sequential Parser failed on test file 
" + f, e);
             }
 
+            pdfs++;
+
             String nonSequentialContent = null;
             Metadata nonSequentialMetadata = new Metadata();
             try {
@@ -1138,6 +1142,202 @@ public class PDFParserTest extends TikaT
         assertContains("<div class=\"embedded\" id=\"Excel.xlsx\" />", xml);
     }
 
+    //Access checker tests
+
+    @Test
+    public void testLegacyAccessChecking() throws Exception {
+        //test that default behavior doesn't throw AccessPermissionException
+        for (String file : new String[] {
+                "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_empty.pdf",
+        }) {
+            String xml = getXML(file).xml;
+            assertContains("Hello World", xml);
+        }
+
+        //now try with the user password
+        PasswordProvider provider = new PasswordProvider() {
+            @Override
+            public String getPassword(Metadata metadata) {
+                return "user";
+            }
+        };
+
+        ParseContext context = new ParseContext();
+        context.set(PasswordProvider.class, provider);
+        Parser parser = new AutoDetectParser();
+
+        for (String path : new String[] {
+                "testPDF_no_extract_no_accessibility_owner_user.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_user.pdf",
+        }) {
+            InputStream stream = null;
+            try {
+                stream = 
TikaInputStream.get(this.getClass().getResource("/test-documents/"+path));
+                String text = getText(stream, parser, context);
+                assertContains("Hello World", text);
+            } finally {
+                IOUtils.closeQuietly(stream);
+            }
+        }
+    }
+
+    /**
+     * Documents with an empty user password: the configured checker alone
+     * decides whether the text comes out.
+     */
+    @Test
+    public void testAccessCheckingEmptyPassword() throws Exception {
+        final String noAccessibility =
+                "/test-documents/" + "testPDF_no_extract_no_accessibility_owner_empty.pdf";
+        final String yesAccessibility =
+                "/test-documents/" + "testPDF_no_extract_yes_accessibility_owner_empty.pdf";
+
+        PDFParserConfig config = new PDFParserConfig();
+        Parser parser = new AutoDetectParser();
+        ParseContext context = new ParseContext();
+        context.set(PDFParserConfig.class, config);
+
+        //don't allow extraction, not even for accessibility:
+        //both documents must be refused
+        config.setAccessChecker(new AccessChecker(false));
+        assertException(noAccessibility, parser, context, AccessPermissionException.class);
+        assertException(yesAccessibility, parser, context, AccessPermissionException.class);
+
+        //allow extraction for accessibility: only the document that
+        //grants that permission may be read
+        config.setAccessChecker(new AccessChecker(true));
+        assertException(noAccessibility, parser, context, AccessPermissionException.class);
+
+        InputStream in = null;
+        try {
+            in = getResourceAsStream(yesAccessibility);
+            assertContains("Hello World", getText(in, parser, context));
+        } finally {
+            IOUtils.closeQuietly(in);
+        }
+    }
+
+    @Test
+    public void testAccessCheckingUserPassword() throws Exception {
+        ParseContext context = new ParseContext();
+
+        PDFParserConfig config = new PDFParserConfig();
+        //don't allow extraction, not even for accessibility
+        config.setAccessChecker(new AccessChecker(false));
+        PasswordProvider passwordProvider = new PasswordProvider() {
+            @Override
+            public String getPassword(Metadata metadata) {
+                return "user";
+            }
+        };
+
+        context.set(PasswordProvider.class, passwordProvider);
+        context.set(PDFParserConfig.class, config);
+
+        Parser parser = new AutoDetectParser();
+
+        //test bad passwords
+        for (String path : new String[] {
+                "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_empty.pdf",
+        }) {
+            assertException("/test-documents/"+path, parser, context, 
EncryptedDocumentException.class);
+        }
+
+        //bad password is still a bad password
+        config.setAccessChecker(new AccessChecker(true));
+        for (String path : new String[] {
+                "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_empty.pdf",
+        }) {
+            assertException("/test-documents/"+path, parser, context, 
EncryptedDocumentException.class);
+        }
+
+        //now test documents that require this "user" password
+        
assertException("/test-documents/"+"testPDF_no_extract_no_accessibility_owner_user.pdf",
+                parser, context, AccessPermissionException.class);
+
+
+        InputStream is = null;
+        try {
+            is = getResourceAsStream("/test-documents/"+ 
"testPDF_no_extract_yes_accessibility_owner_user.pdf");
+            assertContains("Hello World", getText(is, parser, context));
+        } finally {
+            IOUtils.closeQuietly(is);
+        }
+
+        config.setAccessChecker(new AccessChecker(false));
+        for (String path : new String[] {
+                "testPDF_no_extract_no_accessibility_owner_user.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_user.pdf",
+        }) {
+            assertException("/test-documents/"+path, parser, context, 
AccessPermissionException.class);
+        }
+    }
+
+    /**
+     * Parsing with the password "owner": text is expected to be extracted
+     * from every test document, whatever the checker's accessibility setting.
+     */
+    @Test
+    public void testAccessCheckingOwnerPassword() throws Exception {
+        ParseContext context = new ParseContext();
+
+        PDFParserConfig config = new PDFParserConfig();
+        //first round: check permissions, but allow extraction for accessibility
+        config.setAccessChecker(new AccessChecker(true));
+        PasswordProvider passwordProvider = new PasswordProvider() {
+            @Override
+            public String getPassword(Metadata metadata) {
+                return "owner";
+            }
+        };
+
+        context.set(PasswordProvider.class, passwordProvider);
+        context.set(PDFParserConfig.class, config);
+
+        Parser parser = new AutoDetectParser();
+        //with owner's password, text can be extracted, no matter the AccessChecker's settings
+        for (String path : new String[] {
+                "testPDF_no_extract_no_accessibility_owner_user.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_user.pdf",
+                "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_empty.pdf",
+        }) {
+
+            InputStream is = null;
+            try {
+                //open the document under iteration, not a fixed one
+                is = getResourceAsStream("/test-documents/" + path);
+                assertContains("Hello World", getText(is, parser, context));
+            } finally {
+                IOUtils.closeQuietly(is);
+            }
+        }
+
+        //really, with owner's password, all extraction is allowed
+        config.setAccessChecker(new AccessChecker(false));
+        for (String path : new String[] {
+                "testPDF_no_extract_no_accessibility_owner_user.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_user.pdf",
+                "testPDF_no_extract_no_accessibility_owner_empty.pdf",
+                "testPDF_no_extract_yes_accessibility_owner_empty.pdf",
+        }) {
+
+            InputStream is = null;
+            try {
+                //open the document under iteration, not a fixed one
+                is = getResourceAsStream("/test-documents/" + path);
+                assertContains("Hello World", getText(is, parser, context));
+            } finally {
+                IOUtils.closeQuietly(is);
+            }
+        }
+    }
+
+    /**
+     * Parses the resource at {@code path} and asserts that parsing fails with
+     * an exception of exactly the {@code expected} class.
+     *
+     * @param path classpath location of the test document
+     * @param parser parser to run
+     * @param context parse context carrying the configuration under test
+     * @param expected exact class of the exception that must be thrown
+     */
+    private void assertException(String path, Parser parser, ParseContext context,
+                                 Class<? extends Exception> expected) {
+        boolean noEx = false;
+        InputStream is = getResourceAsStream(path);
+        try {
+            //only the failure matters; the extracted text is discarded
+            getText(is, parser, context);
+            noEx = true;
+        } catch (Exception e) {
+            assertEquals("Not the right exception: "+path, expected, e.getClass());
+        } finally {
+            IOUtils.closeQuietly(is);
+        }
+        assertFalse(path + " should have thrown exception", noEx);
+    }
     /**
      * 
      * Simple class to count end of document events.  If functionality is 
useful,

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_empty.pdf
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_empty.pdf?rev=1663764&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_empty.pdf
 (added)
+++ 
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_empty.pdf
 Tue Mar  3 18:51:41 2015
@@ -0,0 +1,87 @@
+%PDF-1.4
+%öäüß
+1 0 obj
+<<
+/Type /Catalog
+/Version /1.4
+/Pages 2 0 R
+>>
+endobj
+2 0 obj
+<<
+/Type /Pages
+/Kids [3 0 R]
+/Count 1
+>>
+endobj
+3 0 obj
+<<
+/Type /Page
+/MediaBox [0.0 0.0 612.0 792.0]
+/Parent 2 0 R
+/Contents 4 0 R
+/Resources 5 0 R
+>>
+endobj
+4 0 obj
+<<
+/Filter [/FlateDecode]
+/Length 6 0 R
+>>
+stream
+KßZz&$ùª8^á":°iÏËIþ%`8etoiczª´Ð       [
+endstream
+endobj
+5 0 obj
+<<
+/Font 7 0 R
+>>
+endobj
+6 0 obj
+50
+endobj
+7 0 obj
+<<
+/F1 8 0 R
+>>
+endobj
+8 0 obj
+<<
+/Subtype /Type1
+/BaseFont /Helvetica-Bold
+/Encoding /WinAnsiEncoding
+>>
+endobj
+9 0 obj
+<<
+/Filter /Standard
+/V 1
+/R 3
+/Length 40
+/P -532
+/O <92EA49CA9DCB5D63ED10DA009E9702A403138C6B0DB22EAD209FC73D70EF86F4>
+/U <A82D4E323C8FE41C5571FA0856FFD74128BF4E5E4E758A4164004E56FFFA0108>
+>>
+endobj
+xref
+0 10
+0000000000 65535 f
+0000000015 00000 n
+0000000078 00000 n
+0000000135 00000 n
+0000000247 00000 n
+0000000375 00000 n
+0000000408 00000 n
+0000000426 00000 n
+0000000457 00000 n
+0000000547 00000 n
+trailer
+<<
+/Root 1 0 R
+/ID [<768A456CFDDEA53BC3965B4569E65812> <768A456CFDDEA53BC3965B4569E65812>]
+/Encrypt 9 0 R
+/Size 10
+>>
+startxref
+755
+%%EOF

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_user.pdf
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_user.pdf?rev=1663764&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_user.pdf
 (added)
+++ 
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_user.pdf
 Tue Mar  3 18:51:41 2015
@@ -0,0 +1,87 @@
+%PDF-1.4
+%öäüß
+1 0 obj
+<<
+/Type /Catalog
+/Version /1.4
+/Pages 2 0 R
+>>
+endobj
+2 0 obj
+<<
+/Type /Pages
+/Kids [3 0 R]
+/Count 1
+>>
+endobj
+3 0 obj
+<<
+/Type /Page
+/MediaBox [0.0 0.0 612.0 792.0]
+/Parent 2 0 R
+/Contents 4 0 R
+/Resources 5 0 R
+>>
+endobj
+4 0 obj
+<<
+/Filter [/FlateDecode]
+/Length 6 0 R
+>>
+stream
+4æ?$7væ/â=©th;U0ªTdRLGÊÎáZÏ¤6aóF¯æéÃ^ªD
+endstream
+endobj
+5 0 obj
+<<
+/Font 7 0 R
+>>
+endobj
+6 0 obj
+50
+endobj
+7 0 obj
+<<
+/F1 8 0 R
+>>
+endobj
+8 0 obj
+<<
+/Subtype /Type1
+/BaseFont /Helvetica-Bold
+/Encoding /WinAnsiEncoding
+>>
+endobj
+9 0 obj
+<<
+/Filter /Standard
+/V 1
+/R 3
+/Length 40
+/P -532
+/O <CF2662E6FB01997CC7651E17056D4DFAD2C78DD5F3F4109BDFFB50433BB04670>
+/U <D803EA55DA7821D2A297F8A68387DCA028BF4E5E4E758A4164004E56FFFA0108>
+>>
+endobj
+xref
+0 10
+0000000000 65535 f
+0000000015 00000 n
+0000000078 00000 n
+0000000135 00000 n
+0000000247 00000 n
+0000000375 00000 n
+0000000408 00000 n
+0000000426 00000 n
+0000000457 00000 n
+0000000547 00000 n
+trailer
+<<
+/Root 1 0 R
+/ID [<75DB321CAFE7680CAD6FC09F51F3DDBE> <75DB321CAFE7680CAD6FC09F51F3DDBE>]
+/Encrypt 9 0 R
+/Size 10
+>>
+startxref
+755
+%%EOF

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_empty.pdf
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_empty.pdf?rev=1663764&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_empty.pdf
 (added)
+++ 
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_empty.pdf
 Tue Mar  3 18:51:41 2015
@@ -0,0 +1,87 @@
+%PDF-1.4
+%öäüß
+1 0 obj
+<<
+/Type /Catalog
+/Version /1.4
+/Pages 2 0 R
+>>
+endobj
+2 0 obj
+<<
+/Type /Pages
+/Kids [3 0 R]
+/Count 1
+>>
+endobj
+3 0 obj
+<<
+/Type /Page
+/MediaBox [0.0 0.0 612.0 792.0]
+/Parent 2 0 R
+/Contents 4 0 R
+/Resources 5 0 R
+>>
+endobj
+4 0 obj
+<<
+/Filter [/FlateDecode]
+/Length 6 0 R
+>>
+stream
+õBÓ0Ï6ÜYmñ¤y©mpneÊèÚ¬jÜWü®_WAÐ×D¥Yèõà Vs
+endstream
+endobj
+5 0 obj
+<<
+/Font 7 0 R
+>>
+endobj
+6 0 obj
+50
+endobj
+7 0 obj
+<<
+/F1 8 0 R
+>>
+endobj
+8 0 obj
+<<
+/Subtype /Type1
+/BaseFont /Helvetica-Bold
+/Encoding /WinAnsiEncoding
+>>
+endobj
+9 0 obj
+<<
+/Filter /Standard
+/V 1
+/R 3
+/Length 40
+/P -20
+/O <92EA49CA9DCB5D63ED10DA009E9702A403138C6B0DB22EAD209FC73D70EF86F4>
+/U <472263FD2B9B40403473D05A693D8C0428BF4E5E4E758A4164004E56FFFA0108>
+>>
+endobj
+xref
+0 10
+0000000000 65535 f
+0000000015 00000 n
+0000000078 00000 n
+0000000135 00000 n
+0000000247 00000 n
+0000000375 00000 n
+0000000408 00000 n
+0000000426 00000 n
+0000000457 00000 n
+0000000547 00000 n
+trailer
+<<
+/Root 1 0 R
+/ID [<AFAC4D6B4301475F6B6D846BEACCEA36> <AFAC4D6B4301475F6B6D846BEACCEA36>]
+/Encrypt 9 0 R
+/Size 10
+>>
+startxref
+754
+%%EOF

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_user.pdf
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_user.pdf?rev=1663764&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_user.pdf
 (added)
+++ 
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_user.pdf
 Tue Mar  3 18:51:41 2015
@@ -0,0 +1,87 @@
+%PDF-1.4
+%öäüß
+1 0 obj
+<<
+/Type /Catalog
+/Version /1.4
+/Pages 2 0 R
+>>
+endobj
+2 0 obj
+<<
+/Type /Pages
+/Kids [3 0 R]
+/Count 1
+>>
+endobj
+3 0 obj
+<<
+/Type /Page
+/MediaBox [0.0 0.0 612.0 792.0]
+/Parent 2 0 R
+/Contents 4 0 R
+/Resources 5 0 R
+>>
+endobj
+4 0 obj
+<<
+/Filter [/FlateDecode]
+/Length 6 0 R
+>>
+stream
+Ä3×Ö°6fîÒÒ6üòÄ)FDüxîu K^,´Ü^Ìÿ8Q¥Qý$J
+endstream
+endobj
+5 0 obj
+<<
+/Font 7 0 R
+>>
+endobj
+6 0 obj
+50
+endobj
+7 0 obj
+<<
+/F1 8 0 R
+>>
+endobj
+8 0 obj
+<<
+/Subtype /Type1
+/BaseFont /Helvetica-Bold
+/Encoding /WinAnsiEncoding
+>>
+endobj
+9 0 obj
+<<
+/Filter /Standard
+/V 1
+/R 3
+/Length 40
+/P -20
+/O <CF2662E6FB01997CC7651E17056D4DFAD2C78DD5F3F4109BDFFB50433BB04670>
+/U <067DAA91A1AC99D15ABFA0AD86050F3B28BF4E5E4E758A4164004E56FFFA0108>
+>>
+endobj
+xref
+0 10
+0000000000 65535 f
+0000000015 00000 n
+0000000078 00000 n
+0000000135 00000 n
+0000000247 00000 n
+0000000375 00000 n
+0000000408 00000 n
+0000000426 00000 n
+0000000457 00000 n
+0000000547 00000 n
+trailer
+<<
+/Root 1 0 R
+/ID [<B8090A679399BCAD86E31DE615910182> <B8090A679399BCAD86E31DE615910182>]
+/Encrypt 9 0 R
+/Size 10
+>>
+startxref
+754
+%%EOF

svn commit: r1663764 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/exception/ tika-core/src/main/java/org/apache/tika/metadata/ tika-parsers/src/main/java/org/apache/tika/parser/pdf/ tika-parsers/src/main/resources/org/apache/tika/parser/pd...

Reply via email to