Author: tallison Date: Tue Mar 3 18:51:41 2015 New Revision: 1663764 URL: http://svn.apache.org/r1663764 Log: TIKA-1489 add optional accessibility checking to PDF files
Added: tika/trunk/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_empty.pdf tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_user.pdf tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_empty.pdf tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_user.pdf Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Added: tika/trunk/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java?rev=1663764&view=auto ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java (added) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java Tue Mar 3 18:51:41 2015 @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.exception; + +/** + * Exception to be thrown when a document does not allow content extraction. + * As of this writing, PDF documents are the only type of document that might + * cause this type of exception. + */ +public class AccessPermissionException extends TikaException { + /** + * Uses the default message "Unable to process: content extraction is not allowed". + */ + public AccessPermissionException() { + super("Unable to process: content extraction is not allowed"); + } + + /** + * Uses the default message and passes {@code th} to the superclass constructor. + * + * @param th throwable handed to the superclass constructor + */ + public AccessPermissionException(Throwable th) { + super("Unable to process: content extraction is not allowed", th); + } + + /** + * @param info message passed unchanged to the superclass constructor + */ + public AccessPermissionException(String info) { + super(info); + } + + /** + * @param info message passed unchanged to the superclass constructor + * @param th throwable passed unchanged to the superclass constructor + */ + public AccessPermissionException(String info, Throwable th) { + super(info, th); + } +} Added: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java?rev=1663764&view=auto ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java (added) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java Tue Mar 3 18:51:41 2015 @@ -0,0 +1,71 @@ +package org.apache.tika.metadata; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Until we can find a common standard, we'll use these options. They + * were mostly derived from PDFBox's AccessPermission, but some can + * apply to other document formats, especially CAN_MODIFY and FILL_IN_FORM. + */ +public interface AccessPermissions { + + /** + * Prefix prepended to the name of every key declared in this interface. + */ + final static String PREFIX = "access_permission"+Metadata.NAMESPACE_PREFIX_DELIMITER; + + /** + * Can any modifications be made to the document. + * NOTE(review): this is the only key declared with externalTextBag; + * every other key here uses externalText -- confirm that is intended. + */ + Property CAN_MODIFY = Property.externalTextBag(PREFIX+"can_modify"); + + /** + * Should content be extracted, generally. + */ + Property EXTRACT_CONTENT = Property.externalText(PREFIX+"extract_content"); + + /** + * Should content be extracted for the purposes + * of accessibility. + */ + Property EXTRACT_FOR_ACCESSIBILITY = Property.externalText(PREFIX + "extract_for_accessibility"); + + /** + * Can the user insert/rotate/delete pages. 
+ */ + Property ASSEMBLE_DOCUMENT = Property.externalText(PREFIX+"assemble_document"); + + + /** + * Can the user fill in a form + */ + Property FILL_IN_FORM = Property.externalText(PREFIX+"fill_in_form"); + + /** + * Can the user modify annotations + */ + Property CAN_MODIFY_ANNOTATIONS = Property.externalText(PREFIX+"modify_annotations"); + + /** + * Can the user print the document + */ + Property CAN_PRINT = Property.externalText(PREFIX+"can_print"); + + /** + * Can the user print an image-degraded version of the document. + */ + Property CAN_PRINT_DEGRADED = Property.externalText(PREFIX+"can_print_degraded"); + +} Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java?rev=1663764&view=auto ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java (added) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java Tue Mar 3 18:51:41 2015 @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.parser.pdf; + +import java.io.Serializable; + +import org.apache.tika.exception.AccessPermissionException; +import org.apache.tika.metadata.AccessPermissions; +import org.apache.tika.metadata.Metadata; + +/** + * Checks whether or not a document allows extraction generally + * or extraction for accessibility only. + */ +public class AccessChecker implements Serializable { + + private static final long serialVersionUID = 6492570218190936986L; + + //false only when built with the no-arg constructor; check() then returns immediately + private final boolean needToCheck; + //true if a document that forbids general extraction may still be read + //when it permits extraction for accessibility + private final boolean allowAccessibility; + + /** + * This constructs an {@link AccessChecker} that + * will not perform any checking and will always return without + * throwing an exception. + * <p> + * This constructor is available to allow for Tika's legacy (&lt;= v1.7) behavior. + */ + public AccessChecker() { + needToCheck = false; + allowAccessibility = true; + } + /** + * This constructs an {@link AccessChecker} that will check + * for whether or not content should be extracted from a document. + * + * @param allowExtractionForAccessibility if general extraction is not allowed, is extraction for accessibility allowed + */ + public AccessChecker(boolean allowExtractionForAccessibility) { + needToCheck = true; + this.allowAccessibility = allowExtractionForAccessibility; + } + + /** + * Checks to see if a document's content should be extracted based + * on metadata values and the {@code allowExtractionForAccessibility} value passed to the constructor. + * Returns normally unless {@link AccessPermissions#EXTRACT_CONTENT} is the string "false"; + * any other value passes. 
+ * + * @param metadata metadata holding the {@link AccessPermissions#EXTRACT_CONTENT} and + * {@link AccessPermissions#EXTRACT_FOR_ACCESSIBILITY} values to test + * @throws AccessPermissionException if access is not permitted + */ + public void check(Metadata metadata) throws AccessPermissionException { + if (!needToCheck) { + //legacy mode: never reject + return; + } + if ("false".equals(metadata.get(AccessPermissions.EXTRACT_CONTENT))) { + if (allowAccessibility) { + //general extraction is forbidden, but the document may still permit accessibility extraction + if("true".equals(metadata.get(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY))) { + return; + } + throw new AccessPermissionException("Content extraction for accessibility is not allowed."); + } + throw new AccessPermissionException("Content extraction is not allowed."); + } + } +} Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1663764&r1=1663763&r2=1663764&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Tue Mar 3 18:51:41 2015 @@ -39,6 +39,7 @@ import org.apache.pdfbox.io.RandomAccess import org.apache.pdfbox.io.RandomAccessFile; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentInformation; +import org.apache.pdfbox.pdmodel.encryption.AccessPermission; import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; @@ -46,6 +47,7 @@ import org.apache.tika.extractor.Embedde import org.apache.tika.io.CloseShieldInputStream; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.AccessPermissions; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.PagedText; import org.apache.tika.metadata.Property; @@ -140,6 +142,9 @@ public class PDFParser extends AbstractP 
metadata.set(Metadata.CONTENT_TYPE, "application/pdf"); extractMetadata(pdfDocument, metadata); + + AccessChecker checker = localConfig.getAccessChecker(); + checker.check(metadata); if (handler != null) { PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig); } @@ -191,6 +196,28 @@ public class PDFParser extends AbstractP private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException { + //first extract AccessPermissions + AccessPermission ap = document.getCurrentAccessPermission(); + metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, + Boolean.toString(ap.canExtractForAccessibility())); + metadata.set(AccessPermissions.EXTRACT_CONTENT, + Boolean.toString(ap.canExtractContent())); + metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, + Boolean.toString(ap.canAssembleDocument())); + metadata.set(AccessPermissions.FILL_IN_FORM, + Boolean.toString(ap.canFillInForm())); + metadata.set(AccessPermissions.CAN_MODIFY, + Boolean.toString(ap.canModify())); + metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, + Boolean.toString(ap.canModifyAnnotations())); + metadata.set(AccessPermissions.CAN_PRINT, + Boolean.toString(ap.canPrint())); + metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, + Boolean.toString(ap.canPrintDegraded())); + + + + //now go for the XMP stuff org.apache.jempbox.xmp.XMPMetadata xmp = null; XMPSchemaDublinCore dcSchema = null; try{ Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java?rev=1663764&r1=1663763&r2=1663764&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java Tue Mar 3 18:51:41 2015 @@ -14,20 +14,20 @@ package 
org.apache.tika.parser.pdf; * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.io.InputStream; -import java.io.Serializable; -import java.util.Locale; -import java.util.Properties; - -import org.apache.pdfbox.util.PDFTextStripper; - -/** - * Config for PDFParser. - * + * limitations under the License. + */ + +import org.apache.pdfbox.util.PDFTextStripper; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Serializable; +import java.util.Locale; +import java.util.Properties; + +/** + * Config for PDFParser. + * * This allows parameters to be set programmatically: * <ol> * <li>Calls to PDFParser, i.e. parser.getPDFParserConfig().setEnableAutoSpace() (as before)</li> @@ -77,12 +77,14 @@ public class PDFParserConfig implements //The character width-based tolerance value used to estimate where spaces in text should be added private Float averageCharTolerance; - //The space width-based tolerance value used to estimate where spaces in text should be added - private Float spacingTolerance; - - public PDFParserConfig() { - init(this.getClass().getResourceAsStream("PDFParser.properties")); - } + //The space width-based tolerance value used to estimate where spaces in text should be added + private Float spacingTolerance; + + private AccessChecker accessChecker; + + public PDFParserConfig() { + init(this.getClass().getResourceAsStream("PDFParser.properties")); + } /** * Loads properties from InputStream and then tries to close InputStream. 
@@ -134,13 +136,24 @@ public class PDFParserConfig implements setExtractInlineImages( getProp(props.getProperty("extractInlineImages"), getExtractInlineImages())); - setExtractUniqueInlineImagesOnly( - getProp(props.getProperty("extractUniqueInlineImagesOnly"), - getExtractUniqueInlineImagesOnly())); - } - - /** - * Configures the given pdf2XHTML. + setExtractUniqueInlineImagesOnly( + getProp(props.getProperty("extractUniqueInlineImagesOnly"), + getExtractUniqueInlineImagesOnly())); + + boolean checkExtractAccessPermission = getProp(props.getProperty("checkExtractAccessPermission"), false); + boolean allowExtractionForAccessibility = getProp(props.getProperty("allowExtractionForAccessibility"), true); + + if (checkExtractAccessPermission == false) { + //silently ignore the crazy configuration of checkExtractAccessPermission = false, + //but allowExtractionForAccessibility=false + accessChecker = new AccessChecker(); + } else { + accessChecker = new AccessChecker(allowExtractionForAccessibility); + } + } + + /** + * Configures the given pdf2XHTML. 
* * @param pdf2XHTML */ @@ -329,12 +342,20 @@ public class PDFParserConfig implements /** * See {@link PDFTextStripper#setSpacingTolerance(float)} - */ - public void setSpacingTolerance(Float spacingTolerance) { - this.spacingTolerance = spacingTolerance; - } - - private boolean getProp(String p, boolean defaultMissing){ + */ + public void setSpacingTolerance(Float spacingTolerance) { + this.spacingTolerance = spacingTolerance; + } + + public void setAccessChecker(AccessChecker accessChecker) { + this.accessChecker = accessChecker; + } + + public AccessChecker getAccessChecker() { + return accessChecker; + } + + private boolean getProp(String p, boolean defaultMissing){ if (p == null){ return defaultMissing; } Modified: tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties?rev=1663764&r1=1663763&r2=1663764&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties (original) +++ tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties Tue Mar 3 18:51:41 2015 @@ -18,6 +18,8 @@ extractAnnotationText true sortByPosition false suppressDuplicateOverlappingText false useNonSequentialParser false -extractAcroFormContent true -extractInlineImages false -extractUniqueInlineImagesOnly true +extractAcroFormContent true +extractInlineImages false +extractUniqueInlineImagesOnly true +checkExtractAccessPermission false +allowExtractionForAccessibility true Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java?rev=1663764&view=auto 
============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java (added) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java Tue Mar 3 18:51:41 2015 @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.pdf; + + +import static org.junit.Assert.assertTrue; + +import org.apache.tika.exception.AccessPermissionException; +import org.apache.tika.metadata.AccessPermissions; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.PropertyTypeException; +import org.junit.Test; + +/** + * Exercises {@link AccessChecker} against hand-built metadata; no PDF is parsed here. + */ +public class AccessCheckerTest { + + @Test + public void testLegacy() throws AccessPermissionException{ + + Metadata m = getMetadata(false, false); + //legacy behavior; don't bother checking + AccessChecker checker = new AccessChecker(); + checker.check(m); + assertTrue("no exception", true); + + m = getMetadata(false, true); + checker.check(m); + assertTrue("no exception", true); + + m = getMetadata(true, true); + checker.check(m); + assertTrue("no exception", true); + } + + @Test + public void testNoExtraction() { + + Metadata m = null; + //allow nothing + AccessChecker checker = new AccessChecker(false); + boolean ex = false; + try { + m = getMetadata(false, false); + checker.check(m); + } catch (AccessPermissionException e) { + ex = true; + } + assertTrue("correct exception with no extraction, no extract for accessibility", ex); + ex = false; + try { + //document allows extraction for accessibility + m = getMetadata(false, true); + checker.check(m); + } catch (AccessPermissionException e) { + //but application is not an accessibility application + ex = true; + } + assertTrue("correct exception with no extraction, accessibility allowed by document but not by checker", ex); + } + + @Test + public void testExtractOnlyForAccessibility() throws AccessPermissionException { + Metadata m = getMetadata(false, true); + //allow accessibility + AccessChecker checker = new AccessChecker(true); + checker.check(m); + assertTrue("no exception", true); + boolean ex = false; + try { + m = getMetadata(false, false); + checker.check(m); + } catch (AccessPermissionException e) { + ex = true; + } + assertTrue("correct exception", ex); + } + + @Test + public void 
testCrazyExtractNotForAccessibility() throws AccessPermissionException { + Metadata m = getMetadata(true, false); + //allow accessibility + AccessChecker checker = new AccessChecker(true); + checker.check(m); + assertTrue("no exception", true); + + //don't extract for accessibility + checker = new AccessChecker(false); + //if extract content is allowed, the checker shouldn't + //check the value of extract for accessibility + checker.check(m); + assertTrue("no exception", true); + + } + + @Test + public void testCantAddMultiplesToMetadata() { + Metadata m = new Metadata(); + boolean ex = false; + m.add(AccessPermissions.EXTRACT_CONTENT, "true"); + try { + m.add(AccessPermissions.EXTRACT_CONTENT, "false"); + } catch (PropertyTypeException e) { + ex = true; + } + assertTrue("can't add multiple values", ex); + + m = new Metadata(); + ex = false; + m.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, "true"); + try { + m.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, "false"); + } catch (PropertyTypeException e) { + ex = true; + } + assertTrue("can't add multiple values", ex); + } + + //builds metadata carrying only the two extraction flags that AccessChecker reads + private Metadata getMetadata(boolean allowExtraction, boolean allowExtractionForAccessibility) { + Metadata m = new Metadata(); + m.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(allowExtraction)); + m.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, Boolean.toString(allowExtractionForAccessibility)); + return m; + } +} Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1663764&r1=1663763&r2=1663764&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Tue Mar 3 18:51:41 2015 @@ -32,6 +32,7 @@ import 
java.util.Map; import java.util.Set; import org.apache.tika.TikaTest; +import org.apache.tika.exception.AccessPermissionException; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.ContainerExtractor; @@ -642,17 +643,20 @@ public class PDFParserTest extends TikaT continue; } - pdfs++; - String sequentialContent = null; Metadata sequentialMetadata = new Metadata(); try { sequentialContent = getText(new FileInputStream(f), sequentialParser, seqContext, sequentialMetadata); + } catch (EncryptedDocumentException e) { + //silently skip a file that requires a user password + continue; } catch (Exception e) { throw new TikaException("Sequential Parser failed on test file " + f, e); } + pdfs++; + String nonSequentialContent = null; Metadata nonSequentialMetadata = new Metadata(); try { @@ -1138,6 +1142,202 @@ public class PDFParserTest extends TikaT assertContains("<div class=\"embedded\" id=\"Excel.xlsx\" />", xml); } + //Access checker tests + + @Test + public void testLegacyAccessChecking() throws Exception { + //test that default behavior doesn't throw AccessPermissionException + for (String file : new String[] { + "testPDF_no_extract_no_accessibility_owner_empty.pdf", + "testPDF_no_extract_yes_accessibility_owner_empty.pdf", + }) { + String xml = getXML(file).xml; + assertContains("Hello World", xml); + } + + //now try with the user password; no PDFParserConfig is placed in the context, + //only the PasswordProvider + PasswordProvider provider = new PasswordProvider() { + @Override + public String getPassword(Metadata metadata) { + return "user"; + } + }; + + ParseContext context = new ParseContext(); + context.set(PasswordProvider.class, provider); + Parser parser = new AutoDetectParser(); + + for (String path : new String[] { + "testPDF_no_extract_no_accessibility_owner_user.pdf", + "testPDF_no_extract_yes_accessibility_owner_user.pdf", + }) { + InputStream stream = null; + try { + stream = 
TikaInputStream.get(this.getClass().getResource("/test-documents/"+path)); + String text = getText(stream, parser, context); + assertContains("Hello World", text); + } finally { + IOUtils.closeQuietly(stream); + } + } + } + + //No PasswordProvider is set in this test: a checker that forbids all extraction + //must reject both files, while a checker that allows accessibility must reject + //only the no_accessibility file. + @Test + public void testAccessCheckingEmptyPassword() throws Exception { + PDFParserConfig config = new PDFParserConfig(); + + //don't allow extraction, not even for accessibility + config.setAccessChecker(new AccessChecker(false)); + Parser parser = new AutoDetectParser(); + ParseContext context = new ParseContext(); + context.set(PDFParserConfig.class, config); + + //test exception for empty password + for (String path : new String[] { + "testPDF_no_extract_no_accessibility_owner_empty.pdf", + "testPDF_no_extract_yes_accessibility_owner_empty.pdf", + }) { + assertException("/test-documents/"+path, parser, context, AccessPermissionException.class); + } + + //now allow extraction for accessibility; config is the same instance already in the context + config.setAccessChecker(new AccessChecker(true)); + assertException("/test-documents/" + "testPDF_no_extract_no_accessibility_owner_empty.pdf", + parser, context, AccessPermissionException.class); + + InputStream is = null; + try { + is = getResourceAsStream("/test-documents/"+ "testPDF_no_extract_yes_accessibility_owner_empty.pdf"); + assertContains("Hello World", getText(is, parser, context)); + } finally { + IOUtils.closeQuietly(is); + } + } + + @Test + public void testAccessCheckingUserPassword() throws Exception { + ParseContext context = new ParseContext(); + + PDFParserConfig config = new PDFParserConfig(); + //don't allow extraction, not even for accessibility + config.setAccessChecker(new AccessChecker(false)); + PasswordProvider passwordProvider = new PasswordProvider() { + @Override + public String getPassword(Metadata metadata) { + return "user"; + } + }; + + context.set(PasswordProvider.class, passwordProvider); + context.set(PDFParserConfig.class, config); + + Parser parser = new AutoDetectParser(); + + //test bad passwords + for (String path : new String[] { + 
"testPDF_no_extract_no_accessibility_owner_empty.pdf", + "testPDF_no_extract_yes_accessibility_owner_empty.pdf", + }) { + assertException("/test-documents/"+path, parser, context, EncryptedDocumentException.class); + } + + //bad password is still a bad password + config.setAccessChecker(new AccessChecker(true)); + for (String path : new String[] { + "testPDF_no_extract_no_accessibility_owner_empty.pdf", + "testPDF_no_extract_yes_accessibility_owner_empty.pdf", + }) { + assertException("/test-documents/"+path, parser, context, EncryptedDocumentException.class); + } + + //now test documents that require this "user" password + assertException("/test-documents/"+"testPDF_no_extract_no_accessibility_owner_user.pdf", + parser, context, AccessPermissionException.class); + + + InputStream is = null; + try { + is = getResourceAsStream("/test-documents/"+ "testPDF_no_extract_yes_accessibility_owner_user.pdf"); + assertContains("Hello World", getText(is, parser, context)); + } finally { + IOUtils.closeQuietly(is); + } + + config.setAccessChecker(new AccessChecker(false)); + for (String path : new String[] { + "testPDF_no_extract_no_accessibility_owner_user.pdf", + "testPDF_no_extract_yes_accessibility_owner_user.pdf", + }) { + assertException("/test-documents/"+path, parser, context, AccessPermissionException.class); + } + } + + @Test + public void testAccessCheckingOwnerPassword() throws Exception { + ParseContext context = new ParseContext(); + + PDFParserConfig config = new PDFParserConfig(); + //allow extraction for accessibility + config.setAccessChecker(new AccessChecker(true)); + PasswordProvider passwordProvider = new PasswordProvider() { + @Override + public String getPassword(Metadata metadata) { + return "owner"; + } + }; + + context.set(PasswordProvider.class, passwordProvider); + context.set(PDFParserConfig.class, config); + + Parser parser = new AutoDetectParser(); + //with owner's password, text can be extracted, no matter the 
AccessChecker's settings + for (String path : new String[] { + "testPDF_no_extract_no_accessibility_owner_user.pdf", + "testPDF_no_extract_yes_accessibility_owner_user.pdf", + "testPDF_no_extract_no_accessibility_owner_empty.pdf", + "testPDF_no_extract_yes_accessibility_owner_empty.pdf", + }) { + + InputStream is = null; + try { + //open the file named by the loop variable so that every listed document is exercised + is = getResourceAsStream("/test-documents/" + path); + assertContains("Hello World", getText(is, parser, context)); + } finally { + IOUtils.closeQuietly(is); + } + } + + //really, with owner's password, all extraction is allowed + config.setAccessChecker(new AccessChecker(false)); + for (String path : new String[] { + "testPDF_no_extract_no_accessibility_owner_user.pdf", + "testPDF_no_extract_yes_accessibility_owner_user.pdf", + "testPDF_no_extract_no_accessibility_owner_empty.pdf", + "testPDF_no_extract_yes_accessibility_owner_empty.pdf", + }) { + + InputStream is = null; + try { + //open the file named by the loop variable so that every listed document is exercised + is = getResourceAsStream("/test-documents/" + path); + assertContains("Hello World", getText(is, parser, context)); + } finally { + IOUtils.closeQuietly(is); + } + } + } + + /** + * Fails unless parsing the resource at {@code path} throws an exception whose + * class is exactly {@code expected}. + */ + private void assertException(String path, Parser parser, ParseContext context, Class<?> expected) { + boolean noEx = false; + InputStream is = getResourceAsStream(path); + try { + getText(is, parser, context); + noEx = true; + } catch (Exception e) { + assertEquals("Not the right exception: "+path, expected, e.getClass()); + } finally { + IOUtils.closeQuietly(is); + } + assertFalse(path + " should have thrown exception", noEx); + } /** * * Simple class to count end of document events. 
If functionality is useful, Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_empty.pdf URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_empty.pdf?rev=1663764&view=auto ============================================================================== --- tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_empty.pdf (added) +++ tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_empty.pdf Tue Mar 3 18:51:41 2015 @@ -0,0 +1,87 @@ +%PDF-1.4 +%öäüß +1 0 obj +<< +/Type /Catalog +/Version /1.4 +/Pages 2 0 R +>> +endobj +2 0 obj +<< +/Type /Pages +/Kids [3 0 R] +/Count 1 +>> +endobj +3 0 obj +<< +/Type /Page +/MediaBox [0.0 0.0 612.0 792.0] +/Parent 2 0 R +/Contents 4 0 R +/Resources 5 0 R +>> +endobj +4 0 obj +<< +/Filter [/FlateDecode] +/Length 6 0 R +>> +stream +KßZz&$ùª8^á" :°iÏËIþ%`8etoiczª´Ð [ +endstream +endobj +5 0 obj +<< +/Font 7 0 R +>> +endobj +6 0 obj +50 +endobj +7 0 obj +<< +/F1 8 0 R +>> +endobj +8 0 obj +<< +/Subtype /Type1 +/BaseFont /Helvetica-Bold +/Encoding /WinAnsiEncoding +>> +endobj +9 0 obj +<< +/Filter /Standard +/V 1 +/R 3 +/Length 40 +/P -532 +/O <92EA49CA9DCB5D63ED10DA009E9702A403138C6B0DB22EAD209FC73D70EF86F4> +/U <A82D4E323C8FE41C5571FA0856FFD74128BF4E5E4E758A4164004E56FFFA0108> +>> +endobj +xref +0 10 +0000000000 65535 f +0000000015 00000 n +0000000078 00000 n +0000000135 00000 n +0000000247 00000 n +0000000375 00000 n +0000000408 00000 n +0000000426 00000 n +0000000457 00000 n +0000000547 00000 n +trailer +<< +/Root 1 0 R +/ID [<768A456CFDDEA53BC3965B4569E65812> <768A456CFDDEA53BC3965B4569E65812>] +/Encrypt 9 0 R +/Size 10 +>> +startxref +755 +%%EOF Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_user.pdf URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_user.pdf?rev=1663764&view=auto ============================================================================== --- tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_user.pdf (added) +++ tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_no_accessibility_owner_user.pdf Tue Mar 3 18:51:41 2015 @@ -0,0 +1,87 @@ +%PDF-1.4 +%öäüß +1 0 obj +<< +/Type /Catalog +/Version /1.4 +/Pages 2 0 R +>> +endobj +2 0 obj +<< +/Type /Pages +/Kids [3 0 R] +/Count 1 +>> +endobj +3 0 obj +<< +/Type /Page +/MediaBox [0.0 0.0 612.0 792.0] +/Parent 2 0 R +/Contents 4 0 R +/Resources 5 0 R +>> +endobj +4 0 obj +<< +/Filter [/FlateDecode] +/Length 6 0 R +>> +stream +4æ?$7væ/â=©th;U0ªTdRLGÊÎáZϤ6aóF¯æéÃ^ªD +endstream +endobj +5 0 obj +<< +/Font 7 0 R +>> +endobj +6 0 obj +50 +endobj +7 0 obj +<< +/F1 8 0 R +>> +endobj +8 0 obj +<< +/Subtype /Type1 +/BaseFont /Helvetica-Bold +/Encoding /WinAnsiEncoding +>> +endobj +9 0 obj +<< +/Filter /Standard +/V 1 +/R 3 +/Length 40 +/P -532 +/O <CF2662E6FB01997CC7651E17056D4DFAD2C78DD5F3F4109BDFFB50433BB04670> +/U <D803EA55DA7821D2A297F8A68387DCA028BF4E5E4E758A4164004E56FFFA0108> +>> +endobj +xref +0 10 +0000000000 65535 f +0000000015 00000 n +0000000078 00000 n +0000000135 00000 n +0000000247 00000 n +0000000375 00000 n +0000000408 00000 n +0000000426 00000 n +0000000457 00000 n +0000000547 00000 n +trailer +<< +/Root 1 0 R +/ID [<75DB321CAFE7680CAD6FC09F51F3DDBE> <75DB321CAFE7680CAD6FC09F51F3DDBE>] +/Encrypt 9 0 R +/Size 10 +>> +startxref +755 +%%EOF Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_empty.pdf URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_empty.pdf?rev=1663764&view=auto 
============================================================================== --- tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_empty.pdf (added) +++ tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_empty.pdf Tue Mar 3 18:51:41 2015 @@ -0,0 +1,87 @@ +%PDF-1.4 +%öäüß +1 0 obj +<< +/Type /Catalog +/Version /1.4 +/Pages 2 0 R +>> +endobj +2 0 obj +<< +/Type /Pages +/Kids [3 0 R] +/Count 1 +>> +endobj +3 0 obj +<< +/Type /Page +/MediaBox [0.0 0.0 612.0 792.0] +/Parent 2 0 R +/Contents 4 0 R +/Resources 5 0 R +>> +endobj +4 0 obj +<< +/Filter [/FlateDecode] +/Length 6 0 R +>> +stream +õBÓ0Ï6ÜYmñ¤y©mpneÊèÚ¬jÜWü®_WAÐ×D¥Yèõà Vs +endstream +endobj +5 0 obj +<< +/Font 7 0 R +>> +endobj +6 0 obj +50 +endobj +7 0 obj +<< +/F1 8 0 R +>> +endobj +8 0 obj +<< +/Subtype /Type1 +/BaseFont /Helvetica-Bold +/Encoding /WinAnsiEncoding +>> +endobj +9 0 obj +<< +/Filter /Standard +/V 1 +/R 3 +/Length 40 +/P -20 +/O <92EA49CA9DCB5D63ED10DA009E9702A403138C6B0DB22EAD209FC73D70EF86F4> +/U <472263FD2B9B40403473D05A693D8C0428BF4E5E4E758A4164004E56FFFA0108> +>> +endobj +xref +0 10 +0000000000 65535 f +0000000015 00000 n +0000000078 00000 n +0000000135 00000 n +0000000247 00000 n +0000000375 00000 n +0000000408 00000 n +0000000426 00000 n +0000000457 00000 n +0000000547 00000 n +trailer +<< +/Root 1 0 R +/ID [<AFAC4D6B4301475F6B6D846BEACCEA36> <AFAC4D6B4301475F6B6D846BEACCEA36>] +/Encrypt 9 0 R +/Size 10 +>> +startxref +754 +%%EOF Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_user.pdf URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_user.pdf?rev=1663764&view=auto ============================================================================== --- 
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_user.pdf (added) +++ tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_no_extract_yes_accessibility_owner_user.pdf Tue Mar 3 18:51:41 2015 @@ -0,0 +1,87 @@ +%PDF-1.4 +%öäüß +1 0 obj +<< +/Type /Catalog +/Version /1.4 +/Pages 2 0 R +>> +endobj +2 0 obj +<< +/Type /Pages +/Kids [3 0 R] +/Count 1 +>> +endobj +3 0 obj +<< +/Type /Page +/MediaBox [0.0 0.0 612.0 792.0] +/Parent 2 0 R +/Contents 4 0 R +/Resources 5 0 R +>> +endobj +4 0 obj +<< +/Filter [/FlateDecode] +/Length 6 0 R +>> +stream +Ä3×Ö°6fîÒÒ6üòÄ)FDüxîu K^,´Ü^Ìÿ8Q¥Qý$J +endstream +endobj +5 0 obj +<< +/Font 7 0 R +>> +endobj +6 0 obj +50 +endobj +7 0 obj +<< +/F1 8 0 R +>> +endobj +8 0 obj +<< +/Subtype /Type1 +/BaseFont /Helvetica-Bold +/Encoding /WinAnsiEncoding +>> +endobj +9 0 obj +<< +/Filter /Standard +/V 1 +/R 3 +/Length 40 +/P -20 +/O <CF2662E6FB01997CC7651E17056D4DFAD2C78DD5F3F4109BDFFB50433BB04670> +/U <067DAA91A1AC99D15ABFA0AD86050F3B28BF4E5E4E758A4164004E56FFFA0108> +>> +endobj +xref +0 10 +0000000000 65535 f +0000000015 00000 n +0000000078 00000 n +0000000135 00000 n +0000000247 00000 n +0000000375 00000 n +0000000408 00000 n +0000000426 00000 n +0000000457 00000 n +0000000547 00000 n +trailer +<< +/Root 1 0 R +/ID [<B8090A679399BCAD86E31DE615910182> <B8090A679399BCAD86E31DE615910182>] +/Encrypt 9 0 R +/Size 10 +>> +startxref +754 +%%EOF