Author: nick
Date: Sun Jan 18 23:15:48 2015
New Revision: 1652869
URL: http://svn.apache.org/r1652869
Log:
TIKA-1521 Support password protected 7zip files
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/test7Z_protected_passTika.7z
(with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java
Modified: tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1652869&r1=1652868&r2=1652869&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sun Jan 18 23:15:48 2015
@@ -1,5 +1,10 @@
Release 1.8 - Current Development
+ * Support password protected 7zip files (using a PasswordProvider,
+ in keeping with the other password supporting formats) (TIKA-1521)
+
+ * Password protected Zip files should not trigger an exception (TIKA-1028)
+
Release 1.7 - 1/9/2015
* Fixed resource leak in OutlookPSTParser that caused TikaException
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=1652869&r1=1652868&r2=1652869&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java Sun Jan 18 23:15:48 2015
@@ -48,6 +48,7 @@ import org.apache.tika.metadata.TikaCore
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -131,9 +132,22 @@ public class PackageParser extends Abstr
stream.reset();
TikaInputStream tstream = TikaInputStream.get(stream, tmp);
- // Pending a fix for COMPRESS-269, this bit is a little nasty
- ais = new SevenZWrapper(new SevenZFile(tstream.getFile()));
+ // Seven Zip supports passwords, was one given?
+ String password = null;
+ PasswordProvider provider = context.get(PasswordProvider.class);
+ if (provider != null) {
+ password = provider.getPassword(metadata);
+ }
+
+ SevenZFile sevenz;
+ if (password == null) {
+ sevenz = new SevenZFile(tstream.getFile());
+ } else {
+ sevenz = new SevenZFile(tstream.getFile(), password.getBytes("UnicodeLittleUnmarked"));
+ }
+ // Pending a fix for COMPRESS-269, this bit is a little nasty
+ ais = new SevenZWrapper(sevenz);
} else {
tmp.close();
throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne);
@@ -170,6 +184,13 @@ public class PackageParser extends Abstr
throw new EncryptedDocumentException(zfe);
}
// Otherwise fall through to raise the exception as normal
+ } catch (IOException ie) {
+ // Is this a password protection error?
+ // (COMPRESS-298 should give a nicer way when implemented)
+ if ("Cannot read encrypted files without a password".equals(ie.getMessage())) {
+ throw new EncryptedDocumentException();
+ }
+ // Otherwise fall through to raise the exception as normal
} finally {
ais.close();
tmp.close();
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java?rev=1652869&r1=1652868&r2=1652869&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java Sun Jan 18 23:15:48 2015
@@ -20,13 +20,16 @@ import static org.junit.Assert.assertEqu
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
import java.io.InputStream;
+import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
@@ -48,7 +51,7 @@ public class Seven7ParserTest extends Ab
parser.getSupportedTypes(recursingContext).contains(TYPE_7ZIP));
// Parse
- InputStream stream = TarParserTest.class.getResourceAsStream(
+ InputStream stream = Seven7ParserTest.class.getResourceAsStream(
"/test-documents/test-documents.7z");
try {
parser.parse(stream, handler, metadata, recursingContext);
@@ -88,7 +91,7 @@ public class Seven7ParserTest extends Ab
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
- InputStream stream = ZipParserTest.class.getResourceAsStream(
+ InputStream stream = Seven7ParserTest.class.getResourceAsStream(
"/test-documents/test-documents.7z");
try {
parser.parse(stream, handler, metadata, trackingContext);
@@ -121,4 +124,75 @@ public class Seven7ParserTest extends Ab
assertTrue("Modified at " + mod, mod.startsWith("20"));
}
}
+
+ @Test
+ public void testPasswordProtected() throws Exception {
+ Parser parser = new AutoDetectParser();
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ // No password, will fail with EncryptedDocumentException
+ InputStream stream = Seven7ParserTest.class.getResourceAsStream(
+ "/test-documents/test7Z_protected_passTika.7z");
+ try {
+ parser.parse(stream, handler, metadata, recursingContext);
+ fail("Shouldn't be able to read a password protected 7z without the password");
+ } catch (EncryptedDocumentException e) {
+ // Good
+ } finally {
+ stream.close();
+ }
+
+
+ // Wrong password currently silently gives no content
+ // Ideally we'd like Commons Compress to give an error, but it doesn't...
+ recursingContext.set(PasswordProvider.class, new PasswordProvider() {
+ @Override
+ public String getPassword(Metadata metadata) {
+ return "wrong";
+ }
+ });
+ handler = new BodyContentHandler();
+ stream = Seven7ParserTest.class.getResourceAsStream(
+ "/test-documents/test7Z_protected_passTika.7z");
+ try {
+ parser.parse(stream, handler, metadata, recursingContext);
+// fail("Shouldn't be able to read a password protected 7z with wrong password");
+// } catch (EncryptedDocumentException e) {
+ } finally {
+ stream.close();
+ }
+
+ // Will be empty
+ assertEquals("", handler.toString());
+
+
+ // Right password works fine
+ recursingContext.set(PasswordProvider.class, new PasswordProvider() {
+ @Override
+ public String getPassword(Metadata metadata) {
+ return "Tika";
+ }
+ });
+ handler = new BodyContentHandler();
+ stream = Seven7ParserTest.class.getResourceAsStream(
+ "/test-documents/test7Z_protected_passTika.7z");
+ try {
+ parser.parse(stream, handler, metadata, recursingContext);
+ } finally {
+ stream.close();
+ }
+
+ assertEquals(TYPE_7ZIP.toString(), metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+
+ // Should get filename
+ assertContains("text.txt", content);
+
+ // Should get contents from the text file in the 7z file
+ assertContains("TEST DATA FOR TIKA.", content);
+ assertContains("This is text inside an encrypted 7zip (7z) file.", content);
+ assertContains("It should be processed by Tika just fine!", content);
+ assertContains("TIKA-1521", content);
+ }
}
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/test7Z_protected_passTika.7z
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/test7Z_protected_passTika.7z?rev=1652869&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/test7Z_protected_passTika.7z
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream