Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,466 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
package org.apache.tika.parser.iwork;

import static org.apache.tika.TikaTest.assertContains;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.InputStream;
import java.util.Arrays;
import java.util.List;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Before;
import org.junit.Test;
import org.xml.sax.ContentHandler;

/**
 * Tests if the IWork parser parses the content and metadata properly of the supported formats
 * (Keynote, Pages and Numbers test documents under {@code /test-documents}).
 */
public class IWorkParserTest {

    private IWorkPackageParser iWorkParser;
    private ParseContext parseContext;

    @Before
    public void setUp() {
        iWorkParser = new IWorkPackageParser();
        parseContext = new ParseContext();
        parseContext.set(Parser.class, new AutoDetectParser());
    }

    /**
     * Opens a classpath test resource, failing with a readable message when it is missing
     * instead of letting a {@code null} stream reach the parser.
     *
     * @param resource absolute classpath location of the test document
     * @return the opened stream; the caller is responsible for closing it
     */
    private static InputStream openResource(String resource) {
        InputStream input = IWorkParserTest.class.getResourceAsStream(resource);
        assertTrue("Test resource not found: " + resource, input != null);
        return input;
    }

    /**
     * Parses a classpath test resource with the iWork parser and closes the stream afterwards.
     *
     * @param resource absolute classpath location of the test document
     * @param metadata metadata object to be populated by the parser
     * @return the text collected by a {@link BodyContentHandler}
     * @throws Exception if opening or parsing the document fails
     */
    private String parseToText(String resource, Metadata metadata) throws Exception {
        try (InputStream input = openResource(resource)) {
            ContentHandler handler = new BodyContentHandler();
            iWorkParser.parse(input, handler, metadata, parseContext);
            return handler.toString();
        }
    }

    /**
     * Check the given InputStream is not closed by the Parser (TIKA-1117).
     *
     * @throws Exception if parsing fails or the stream was closed by the parser
     */
    @Test
    public void testStreamNotClosed() throws Exception {
        // The stream is closed by this test only after the read() probe below.
        try (InputStream input = openResource("/test-documents/testKeynote.key")) {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            iWorkParser.parse(input, handler, metadata, parseContext);
            input.read();   // Will throw an Exception if the stream was already closed.
        }
    }

    @Test
    public void testParseKeynote() throws Exception {
        Metadata metadata = new Metadata();
        String content = parseToText("/test-documents/testKeynote.key", metadata);

        // Make sure enough keys came through
        // (Exact numbers will vary based on composites)
        assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 6);
        List<String> metadataKeys = Arrays.asList(metadata.names());
        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.SLIDE_COUNT.getName()));
        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));

        // Check the metadata values
        assertEquals("application/vnd.apple.keynote", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("3", metadata.get(Metadata.SLIDE_COUNT));
        assertEquals("1024", metadata.get(KeynoteContentHandler.PRESENTATION_WIDTH));
        assertEquals("768", metadata.get(KeynoteContentHandler.PRESENTATION_HEIGHT));
        assertEquals("Tika user", metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("Apache tika", metadata.get(TikaCoreProperties.TITLE));

        assertContains("A sample presentation", content);
        assertContains("For the Apache Tika project", content);
        assertContains("Slide 1", content);
        assertContains("Some random text for the sake of testability.", content);
        assertContains("A nice comment", content);
        assertContains("A nice note", content);

        // test table data
        assertContains("Cell one", content);
        assertContains("Cell two", content);
        assertContains("Cell three", content);
        assertContains("Cell four", content);
        assertContains("Cell 5", content);
        assertContains("Cell six", content);
        assertContains("7", content);
        assertContains("Cell eight", content);
        assertContains("5/5/1985", content);
    }

    // TIKA-910
    @Test
    public void testKeynoteTextBoxes() throws Exception {
        String content = parseToText("/test-documents/testTextBoxes.key", new Metadata());
        assertTrue(content.replaceAll("\\s+", " ").contains("text1 text2 text3"));
    }

    // TIKA-910
    @Test
    public void testKeynoteBulletPoints() throws Exception {
        String content = parseToText("/test-documents/testBulletPoints.key", new Metadata());
        assertTrue(content.replaceAll("\\s+", " ").contains("bullet point 1 bullet point 2 bullet point 3"));
    }

    // TIKA-923
    @Test
    public void testKeynoteTables() throws Exception {
        String content = parseToText("/test-documents/testTables.key", new Metadata());
        content = content.replaceAll("\\s+", " ");
        assertContains("row 1 row 2 row 3", content);
    }

    // TIKA-923
    @Test
    public void testKeynoteMasterSlideTable() throws Exception {
        String content = parseToText("/test-documents/testMasterSlideTable.key", new Metadata());
        content = content.replaceAll("\\s+", " ");
        assertContains("master row 1", content);
        assertContains("master row 2", content);
        assertContains("master row 3", content);
    }

    @Test
    public void testParsePages() throws Exception {
        Metadata metadata = new Metadata();
        String content = parseToText("/test-documents/testPages.pages", metadata);

        // Make sure enough keys came through
        // (Exact numbers will vary based on composites)
        assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 50);
        List<String> metadataKeys = Arrays.asList(metadata.names());
        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.PAGE_COUNT.getName()));
        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.LAST_MODIFIED.getName()));
        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.LANGUAGE));

        // Check the metadata values
        assertEquals("application/vnd.apple.pages", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("Tika user", metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("Apache tika", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("2010-05-09T21:34:38+0200", metadata.get(Metadata.CREATION_DATE));
        assertEquals("2010-05-09T23:50:36+0200", metadata.get(Metadata.LAST_MODIFIED));
        assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE));
        assertEquals("2", metadata.get(Metadata.PAGE_COUNT));

        // text on page 1
        assertContains("Sample pages document", content);
        assertContains("Some plain text to parse.", content);
        assertContains("Cell one", content);
        assertContains("Cell two", content);
        assertContains("Cell three", content);
        assertContains("Cell four", content);
        assertContains("Cell five", content);
        assertContains("Cell six", content);
        assertContains("Cell seven", content);
        assertContains("Cell eight", content);
        assertContains("Cell nine", content);
        assertContains("Both Pages 1.x and Keynote 2.x", content); // ...

        // text on page 2
        assertContains("A second page....", content);
        assertContains("Extensible Markup Language", content); // ...
    }

    // TIKA-904
    @Test
    public void testPagesLayoutMode() throws Exception {
        String content = parseToText("/test-documents/testPagesLayout.pages", new Metadata());
        assertContains("text box 1 - here is some text", content);
        assertContains("created in a text box in layout mode", content);
        assertContains("text box 2 - more text!@!$@#", content);
        assertContains("this is text inside of a green box", content);
        assertContains("text inside of a green circle", content);
    }

    @Test
    public void testParseNumbers() throws Exception {
        Metadata metadata = new Metadata();
        String content = parseToText("/test-documents/testNumbers.numbers", metadata);

        // Make sure enough keys came through
        // (Exact numbers will vary based on composites)
        assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 8);
        List<String> metadataKeys = Arrays.asList(metadata.names());
        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.PAGE_COUNT.getName()));
        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.COMMENTS.getName()));
        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.TITLE));
        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));

        // Check the metadata values
        assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
        assertEquals("Tika User", metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("Account checking", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("a comment", metadata.get(TikaCoreProperties.COMMENTS));

        assertContains("Category", content);
        assertContains("Home", content);
        assertContains("-226", content);
        assertContains("-137.5", content);
        assertContains("Checking Account: 300545668", content);
        assertContains("4650", content);
        assertContains("Credit Card", content);
        assertContains("Groceries", content);
        assertContains("-210", content);
        assertContains("Food", content);
        assertContains("Try adding your own account transactions to this table.", content);
    }

    // TIKA-924
    @Test
    public void testParseNumbersTableNames() throws Exception {
        String content = parseToText("/test-documents/tableNames.numbers", new Metadata());
        assertContains("This is the main table", content);
    }

    @Test
    public void testParseNumbersTableHeaders() throws Exception {
        String content = parseToText("/test-documents/tableHeaders.numbers", new Metadata());
        for (int header = 1; header <= 5; header++) {
            assertContains("header" + header, content);
        }
        for (int row = 1; row <= 3; row++) {
            assertContains("row" + row, content);
        }
    }

    /**
     * We don't currently support password protected Pages files, as
     * we don't know how the encryption works (it's not regular Zip
     * Encryption). See TIKA-903 for details
     */
    @Test
    public void testParsePagesPasswordProtected() throws Exception {
        // Document password is "tika", but we can't use that yet...
        Metadata metadata = new Metadata();
        String content = parseToText("/test-documents/testPagesPwdProtected.pages", metadata);

        // Content will be empty
        assertEquals("", content);

        // Will have been identified as encrypted
        assertEquals("application/x-tika-iworks-protected", metadata.get(Metadata.CONTENT_TYPE));
    }

    /**
     * Check we get headers, footers and footnotes from Pages
     */
    @Test
    public void testParsePagesHeadersFootersFootnotes() throws Exception {
        String footnote = "Footnote: Do a lot of people really use iWork?!?!";
        String header = "THIS IS SOME HEADER TEXT";
        String footer = "THIS IS SOME FOOTER TEXT\t1";
        String footer2 = "THIS IS SOME FOOTER TEXT\t2";

        String contents = parseToText("/test-documents/testPagesHeadersFootersFootnotes.pages", new Metadata());

        // Check regular text
        assertContains("Both Pages 1.x", contents); // P1
        assertContains("understanding the Pages document", contents); // P1
        assertContains("should be page 2", contents); // P2

        // Check for headers, footers and footnotes
        assertContains(header, contents);
        assertContains(footer, contents);
        assertContains(footer2, contents);
        assertContains(footnote, contents);
    }

    /**
     * Check we get upper-case Roman numerals within the footer for AutoPageNumber.
     */
    @Test
    public void testParsePagesHeadersFootersRomanUpper() throws Exception {
        String header = "THIS IS SOME HEADER TEXT";
        String footer = "THIS IS SOME FOOTER TEXT\tI";
        String footer2 = "THIS IS SOME FOOTER TEXT\tII";

        String contents = parseToText("/test-documents/testPagesHeadersFootersRomanUpper.pages", new Metadata());

        // Check for the header and both numbered footers
        assertContains(header, contents);
        assertContains(footer, contents);
        assertContains(footer2, contents);
    }

    /**
     * Check we get lower-case Roman numerals within the footer for AutoPageNumber.
     */
    @Test
    public void testParsePagesHeadersFootersRomanLower() throws Exception {
        String header = "THIS IS SOME HEADER TEXT";
        String footer = "THIS IS SOME FOOTER TEXT\ti";
        String footer2 = "THIS IS SOME FOOTER TEXT\tii";

        String contents = parseToText("/test-documents/testPagesHeadersFootersRomanLower.pages", new Metadata());

        // Check for the header and both numbered footers
        assertContains(header, contents);
        assertContains(footer, contents);
        assertContains(footer2, contents);
    }

    /**
     * Check we get upper-case alphabetic letters within the header and footer for AutoPageNumber.
     */
    @Test
    public void testParsePagesHeadersAlphaUpper() throws Exception {
        String header = "THIS IS SOME HEADER TEXT\tA";
        String footer = "THIS IS SOME FOOTER TEXT\tA";
        String footer2 = "THIS IS SOME FOOTER TEXT\tB";

        String contents = parseToText("/test-documents/testPagesHeadersFootersAlphaUpper.pages", new Metadata());

        // Check for the header and both lettered footers
        assertContains(header, contents);
        assertContains(footer, contents);
        assertContains(footer2, contents);
    }

    /**
     * Check we get lower-case alphabetic letters within the footer for AutoPageNumber.
     */
    @Test
    public void testParsePagesHeadersAlphaLower() throws Exception {
        String header = "THIS IS SOME HEADER TEXT";
        String footer = "THIS IS SOME FOOTER TEXT\ta";
        String footer2 = "THIS IS SOME FOOTER TEXT\tb";

        String contents = parseToText("/test-documents/testPagesHeadersFootersAlphaLower.pages", new Metadata());

        // Check for the header and both lettered footers
        assertContains(header, contents);
        assertContains(footer, contents);
        assertContains(footer2, contents);
    }

    /**
     * Check we get annotations (eg comments) from Pages
     */
    @Test
    public void testParsePagesAnnotations() throws Exception {
        String commentA = "comment about the APXL file";
        String commentB = "comment about UIMA";

        String contents = parseToText("/test-documents/testPagesComments.pages", new Metadata());

        // Check regular text
        assertContains("Both Pages 1.x", contents); // P1
        assertContains("understanding the Pages document", contents); // P1
        assertContains("should be page 2", contents); // P2

        // Check for comments
        assertContains(commentA, contents);
        assertContains(commentB, contents);
    }

    // TIKA-918
    @Test
    public void testNumbersExtractChartNames() throws Exception {
        String contents = parseToText("/test-documents/testNumbersCharts.numbers", new Metadata());
        assertContains("Expenditure by Category", contents);
        assertContains("Currency Chart name", contents);
        assertContains("Chart 2", contents);
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
package org.apache.tika.parser.pkg;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;

import org.apache.tika.TikaTest;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.junit.Before;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * Parent class for all Package based Test cases.
 * <p>
 * Before each test it prepares two parse contexts: {@link #trackingContext}, whose
 * embedded-document parser is an {@link EmbeddedTrackingParser} that only records what it
 * is handed, and {@link #recursingContext}, whose embedded-document parser is a real
 * {@link AutoDetectParser}.
 */
public abstract class AbstractPkgTest extends TikaTest {
    protected ParseContext trackingContext;
    protected ParseContext recursingContext;

    protected Parser autoDetectParser;
    protected EmbeddedTrackingParser tracker;

    @Before
    public void setUp() throws Exception {
        tracker = new EmbeddedTrackingParser();
        trackingContext = new ParseContext();
        trackingContext.set(Parser.class, tracker);

        autoDetectParser = new AutoDetectParser();
        recursingContext = new ParseContext();
        recursingContext.set(Parser.class, autoDetectParser);
    }


    /**
     * Parser stand-in that does no real parsing: for every document it is asked to parse it
     * records selected metadata values and the first bytes of the stream, so tests can
     * assert on what the container parser passed down.
     */
    @SuppressWarnings("serial")
    protected static class EmbeddedTrackingParser extends AbstractParser {
        /** Number of leading bytes captured from each parsed stream. */
        private static final int START_LENGTH = 32;

        protected List<String> filenames = new ArrayList<>();
        protected List<String> mediatypes = new ArrayList<>();
        protected List<String> createdAts = new ArrayList<>();
        protected List<String> modifiedAts = new ArrayList<>();
        /** Leading bytes of the most recently parsed stream, zero-padded to 32 bytes. */
        protected byte[] lastSeenStart;

        /**
         * Forgets everything recorded so far, including the captured stream start.
         */
        public void reset() {
            filenames.clear();
            mediatypes.clear();
            createdAts.clear();
            modifiedAts.clear();
            lastSeenStart = null;
        }

        @Override
        public Set<MediaType> getSupportedTypes(ParseContext context) {
            // Cheat! Report whatever AutoDetectParser reports as supported.
            return (new AutoDetectParser()).getSupportedTypes(context);
        }

        /**
         * Records the resource name, content type, created and modified values found in
         * {@code metadata} (each may be {@code null}) and captures the start of the stream.
         */
        @Override
        public void parse(InputStream stream, ContentHandler handler,
                          Metadata metadata, ParseContext context) throws IOException,
                SAXException, TikaException {
            filenames.add(metadata.get(Metadata.RESOURCE_NAME_KEY));
            mediatypes.add(metadata.get(Metadata.CONTENT_TYPE));
            createdAts.add(metadata.get(TikaCoreProperties.CREATED));
            modifiedAts.add(metadata.get(TikaCoreProperties.MODIFIED));

            // A single read() may legally return fewer bytes than requested, so keep
            // reading until the buffer is full or the stream ends.
            byte[] start = new byte[START_LENGTH];
            int filled = 0;
            while (filled < start.length) {
                int count = stream.read(start, filled, start.length - filled);
                if (count < 0) {
                    break;
                }
                filled += count;
            }
            lastSeenStart = start;
        }

    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,110 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */

package org.apache.tika.parser.pkg;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;

import java.io.InputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;

/**
 * Test case for parsing ar archives.
 */
public class ArParserTest extends AbstractPkgTest {

    /**
     * Parses the given archive with a fresh parser, handler and metadata using the
     * recursing context, and checks that it was detected as an ar archive.
     * Fresh objects per archive ensure a stale content type from an earlier parse
     * cannot satisfy the assertion.
     */
    private void assertDetectedAsArchive(String resource) throws Exception {
        Parser parser = new AutoDetectParser();
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        try (InputStream stream = ArParserTest.class.getResourceAsStream(resource)) {
            parser.parse(stream, handler, metadata, recursingContext);
        }

        assertEquals("application/x-archive",
                metadata.get(Metadata.CONTENT_TYPE));
    }

    /**
     * Parses the given archive with the tracking context and checks that exactly one
     * embedded entry with the expected name was handed to the tracker.
     */
    private void assertSingleTrackedEntry(String resource, String expectedName) throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        try (InputStream stream = ArParserTest.class.getResourceAsStream(resource)) {
            parser.parse(stream, handler, metadata, trackingContext);
        }

        assertEquals(1, tracker.filenames.size());
        assertEquals(1, tracker.mediatypes.size());
        assertEquals(1, tracker.modifiedAts.size());

        assertEquals(expectedName, tracker.filenames.get(0));

        String modifiedAt = tracker.modifiedAts.get(0);
        assertTrue("Modified at " + modifiedAt, modifiedAt.startsWith("201"));

        for (String type : tracker.mediatypes) {
            assertNull(type);
        }
        for (String crt : tracker.createdAts) {
            assertNull(crt);
        }
    }

    @Test
    public void testArParsing() throws Exception {
        assertDetectedAsArchive("/test-documents/testARofText.ar");
        assertDetectedAsArchive("/test-documents/testARofSND.ar");
    }

    /**
     * Tests that the ParseContext parser is correctly fired for all the
     * embedded entries.
     */
    @Test
    public void testEmbedded() throws Exception {
        assertSingleTrackedEntry("/test-documents/testARofText.ar", "testTXT.txt");

        // Forget the first archive's entries before inspecting the second one.
        tracker.reset();
        assertSingleTrackedEntry("/test-documents/testARofSND.ar", "testAU.au");
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java (added) +++ 
tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */
package org.apache.tika.parser.pkg;

import static java.nio.charset.StandardCharsets.US_ASCII;
import static org.junit.Assert.assertEquals;

import java.io.InputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;

/**
 * Test case for parsing bzip2 files.
 */
public class Bzip2ParserTest extends AbstractPkgTest {

    /** Classpath location of the bzip2-compressed tar archive used by both tests. */
    private static final String TEST_ARCHIVE = "/test-documents/test-documents.tbz2";

    /**
     * Parses the archive with the recursing context and checks the detected type and
     * that the names of the contained documents appear in the extracted text.
     */
    @Test
    public void testBzip2Parsing() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        try (InputStream stream = Bzip2ParserTest.class.getResourceAsStream(TEST_ARCHIVE)) {
            parser.parse(stream, handler, metadata, recursingContext);
        }

        assertEquals("application/x-bzip2", metadata.get(Metadata.CONTENT_TYPE));
        String content = handler.toString();
        assertContains("test-documents/testEXCEL.xls", content);
        assertContains("test-documents/testHTML.html", content);
        assertContains("test-documents/testOpenOffice2.odt", content);
        assertContains("test-documents/testPDF.pdf", content);
        assertContains("test-documents/testPPT.ppt", content);
        assertContains("test-documents/testRTF.rtf", content);
        assertContains("test-documents/testTXT.txt", content);
        assertContains("test-documents/testWORD.doc", content);
        assertContains("test-documents/testXML.xml", content);
    }


    /**
     * Tests that the ParseContext parser is correctly
     * fired for all the embedded entries.
     */
    @Test
    public void testEmbedded() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        // Load through this class rather than an unrelated sibling test class.
        try (InputStream stream = Bzip2ParserTest.class.getResourceAsStream(TEST_ARCHIVE)) {
            parser.parse(stream, handler, metadata, trackingContext);
        }

        // Should find a single entry, for the (compressed) tar file
        assertEquals(1, tracker.filenames.size());
        assertEquals(1, tracker.mediatypes.size());
        assertEquals(1, tracker.modifiedAts.size());

        assertEquals(null, tracker.filenames.get(0));
        assertEquals(null, tracker.mediatypes.get(0));
        assertEquals(null, tracker.createdAts.get(0));
        assertEquals(null, tracker.modifiedAts.get(0));

        // Tar file starts with the directory name
        assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII));
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/CompressParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/CompressParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/CompressParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/CompressParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pkg; + +import static java.nio.charset.StandardCharsets.US_ASCII; +import static org.junit.Assert.assertEquals; + +import java.io.InputStream; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +/** + * Test case for parsing compress (.Z) files. + */ +public class CompressParserTest extends AbstractPkgTest { + @Test + public void testCompressParsing() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + InputStream stream = TarParserTest.class.getResourceAsStream( + "/test-documents/test-documents.tar.Z"); + try { + parser.parse(stream, handler, metadata, recursingContext); + } finally { + stream.close(); + } + + assertEquals("application/x-compress", metadata.get(Metadata.CONTENT_TYPE)); + String content = handler.toString(); + assertContains("test-documents/testEXCEL.xls", content); + assertContains("test-documents/testHTML.html", content); + assertContains("test-documents/testOpenOffice2.odt", content); + assertContains("test-documents/testPDF.pdf", content); + assertContains("test-documents/testPPT.ppt", content); + assertContains("test-documents/testRTF.rtf", content); + assertContains("test-documents/testTXT.txt", content); + assertContains("test-documents/testWORD.doc", content); + assertContains("test-documents/testXML.xml", content); + } + + /** + * Tests that the ParseContext parser is correctly + * fired for all the embedded entries. + */ + @Test + public void testEmbedded() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + InputStream stream = ZipParserTest.class.getResourceAsStream( + "/test-documents/test-documents.tar.Z"); + try { + parser.parse(stream, handler, metadata, trackingContext); + } finally { + stream.close(); + } + + // Should find a single entry, for the (compressed) tar file + assertEquals(1, tracker.filenames.size()); + assertEquals(1, tracker.mediatypes.size()); + assertEquals(1, tracker.modifiedAts.size()); + + assertEquals(null, tracker.filenames.get(0)); + assertEquals(null, tracker.mediatypes.get(0)); + assertEquals(null, tracker.modifiedAts.get(0)); + + // Tar file starts with the directory name + assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII)); + } +} \ No newline at end of file Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pkg; + +import static java.nio.charset.StandardCharsets.US_ASCII; +import static org.junit.Assert.assertEquals; + +import java.io.InputStream; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +/** + * Test case for parsing gzip files. + */ +public class GzipParserTest extends AbstractPkgTest { + + @Test + public void testGzipParsing() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = GzipParserTest.class.getResourceAsStream( + "/test-documents/test-documents.tgz")) { + parser.parse(stream, handler, metadata, recursingContext); + } + + assertEquals("application/gzip", metadata.get(Metadata.CONTENT_TYPE)); + String content = handler.toString(); + assertContains("test-documents/testEXCEL.xls", content); + assertContains("test-documents/testHTML.html", content); + assertContains("test-documents/testOpenOffice2.odt", content); + assertContains("test-documents/testPDF.pdf", content); + assertContains("test-documents/testPPT.ppt", content); + assertContains("test-documents/testRTF.rtf", content); + assertContains("test-documents/testTXT.txt", content); + assertContains("test-documents/testWORD.doc", content); + assertContains("test-documents/testXML.xml", content); + } + + /** + * Tests that the ParseContext parser is correctly + * fired for all the embedded entries. + */ + @Test + public void testEmbedded() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = ZipParserTest.class.getResourceAsStream( + "/test-documents/test-documents.tgz")) { + parser.parse(stream, handler, metadata, trackingContext); + } + + // Should find a single entry, for the (compressed) tar file + assertEquals(1, tracker.filenames.size()); + assertEquals(1, tracker.mediatypes.size()); + assertEquals(1, tracker.modifiedAts.size()); + + assertEquals(null, tracker.filenames.get(0)); + assertEquals(null, tracker.mediatypes.get(0)); + assertEquals(null, tracker.modifiedAts.get(0)); + + // Tar file starts with the directory name + assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII)); + } + + @Test + public void testSvgzParsing() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = GzipParserTest.class.getResourceAsStream( + "/test-documents/testSVG.svgz")) { + parser.parse(stream, handler, metadata, recursingContext); + } + + assertEquals("application/gzip", metadata.get(Metadata.CONTENT_TYPE)); + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under 
one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pkg; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +import java.io.InputStream; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +/** + * Test case for parsing rar files. + */ +public class RarParserTest extends AbstractPkgTest { + + @Test + public void testRarParsing() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = RarParserTest.class.getResourceAsStream( + "/test-documents/test-documents.rar")) { + parser.parse(stream, handler, metadata, recursingContext); + } + + assertEquals("application/x-rar-compressed", metadata.get(Metadata.CONTENT_TYPE)); + String content = handler.toString(); + assertContains("test-documents/testEXCEL.xls", content); + assertContains("test-documents/testHTML.html", content); + assertContains("test-documents/testOpenOffice2.odt", content); + assertContains("test-documents/testPDF.pdf", content); + assertContains("test-documents/testPPT.ppt", content); + assertContains("test-documents/testRTF.rtf", content); + assertContains("test-documents/testTXT.txt", content); + assertContains("test-documents/testWORD.doc", content); + assertContains("test-documents/testXML.xml", content); + } + + /** + * Tests that the ParseContext parser is correctly + * fired for all the embedded entries. + */ + @Test + public void testEmbedded() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = RarParserTest.class.getResourceAsStream( + "/test-documents/test-documents.rar")) { + parser.parse(stream, handler, metadata, trackingContext); + } + + // Should have found all 9 documents, but not the directory + assertEquals(9, tracker.filenames.size()); + assertEquals(9, tracker.mediatypes.size()); + assertEquals(9, tracker.modifiedAts.size()); + + // Should have names but not content types, as rar doesn't + // store the content types + assertEquals("test-documents/testEXCEL.xls", tracker.filenames.get(0)); + assertEquals("test-documents/testHTML.html", tracker.filenames.get(1)); + assertEquals("test-documents/testOpenOffice2.odt", tracker.filenames.get(2)); + assertEquals("test-documents/testPDF.pdf", tracker.filenames.get(3)); + assertEquals("test-documents/testPPT.ppt", tracker.filenames.get(4)); + assertEquals("test-documents/testRTF.rtf", tracker.filenames.get(5)); + assertEquals("test-documents/testTXT.txt", tracker.filenames.get(6)); + assertEquals("test-documents/testWORD.doc", tracker.filenames.get(7)); + assertEquals("test-documents/testXML.xml", tracker.filenames.get(8)); + + for(String type : tracker.mediatypes) { + assertNull(type); + } + for(String crt : tracker.createdAts) { + assertNull(crt); + } + for(String mod : tracker.modifiedAts) { + assertNotNull(mod); + assertTrue("Modified at " + mod, mod.startsWith("20")); + } + + // Should have filenames in the content string + String content = handler.toString(); + assertContains("test-documents/testHTML.html", content); + assertContains("test-documents/testEXCEL.xls", content); + assertContains("test-documents/testOpenOffice2.odt", content); + assertContains("test-documents/testPDF.pdf", content); + assertContains("test-documents/testPPT.ppt", content); + assertContains("test-documents/testRTF.rtf", content); + assertContains("test-documents/testTXT.txt", content); + 
assertContains("test-documents/testWORD.doc", content); + assertContains("test-documents/testXML.xml", content); + } +} \ No newline at end of file Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,210 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.pkg; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import javax.crypto.Cipher; + +import java.io.InputStream; +import java.security.NoSuchAlgorithmException; + +import org.apache.tika.exception.EncryptedDocumentException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.PasswordProvider; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +/** + * Test case for parsing 7z files. + */ +public class Seven7ParserTest extends AbstractPkgTest { + private static final MediaType TYPE_7ZIP = MediaType.application("x-7z-compressed"); + + @Test + public void test7ZParsing() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + // Ensure 7zip is a parsable format + assertTrue("No 7zip parser found", + parser.getSupportedTypes(recursingContext).contains(TYPE_7ZIP)); + + // Parse + try (InputStream stream = Seven7ParserTest.class.getResourceAsStream( + "/test-documents/test-documents.7z")) { + parser.parse(stream, handler, metadata, recursingContext); + } + + assertEquals(TYPE_7ZIP.toString(), metadata.get(Metadata.CONTENT_TYPE)); + String content = handler.toString(); + assertContains("test-documents/testEXCEL.xls", content); + assertContains("test-documents/testHTML.html", content); + assertContains("test-documents/testOpenOffice2.odt", content); + assertContains("test-documents/testPDF.pdf", content); + assertContains("test-documents/testPPT.ppt", content); + assertContains("test-documents/testRTF.rtf", content); + assertContains("test-documents/testTXT.txt", content); + assertContains("test-documents/testWORD.doc", content); + assertContains("test-documents/testXML.xml", content); + } + + /** + * Tests that the ParseContext parser is correctly + * fired for all the embedded entries. + */ + @Test + public void testEmbedded() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = Seven7ParserTest.class.getResourceAsStream( + "/test-documents/test-documents.7z")) { + parser.parse(stream, handler, metadata, trackingContext); + } + + // Should have found all 9 documents, but not the directory + assertEquals(9, tracker.filenames.size()); + assertEquals(9, tracker.mediatypes.size()); + assertEquals(9, tracker.modifiedAts.size()); + + // Should have names but not content types, as 7z doesn't + // store the content types + assertEquals("test-documents/testEXCEL.xls", tracker.filenames.get(0)); + assertEquals("test-documents/testHTML.html", tracker.filenames.get(1)); + assertEquals("test-documents/testOpenOffice2.odt", tracker.filenames.get(2)); + assertEquals("test-documents/testPDF.pdf", tracker.filenames.get(3)); + assertEquals("test-documents/testPPT.ppt", tracker.filenames.get(4)); + assertEquals("test-documents/testRTF.rtf", tracker.filenames.get(5)); + assertEquals("test-documents/testTXT.txt", tracker.filenames.get(6)); + assertEquals("test-documents/testWORD.doc", tracker.filenames.get(7)); + assertEquals("test-documents/testXML.xml", tracker.filenames.get(8)); + + for(String type : tracker.mediatypes) { + assertNull(type); + } + for(String mod : tracker.modifiedAts) { + assertNotNull(mod); + assertTrue("Modified at " + mod, mod.startsWith("20")); + } + } + + @Test + public void testPasswordProtected() throws Exception { + Parser parser = new AutoDetectParser(); + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + // No password, will fail with EncryptedDocumentException + boolean ex = false; + try (InputStream stream = Seven7ParserTest.class.getResourceAsStream( + "/test-documents/test7Z_protected_passTika.7z")) { + parser.parse(stream, handler, metadata, recursingContext); + fail("Shouldn't be able to read a password protected 7z without the password"); + } catch 
(EncryptedDocumentException e) { + // Good + ex = true; + } + + assertTrue("test no password", ex); + + ex = false; + + // A wrong password is expected to make the parse fail with a TikaException + // (it no longer silently yields empty content; see the catch block below) + recursingContext.set(PasswordProvider.class, new PasswordProvider() { + @Override + public String getPassword(Metadata metadata) { + return "wrong"; + } + }); + handler = new BodyContentHandler(); + try (InputStream stream = Seven7ParserTest.class.getResourceAsStream( + "/test-documents/test7Z_protected_passTika.7z")) { + parser.parse(stream, handler, metadata, recursingContext); + fail("Shouldn't be able to read a password protected 7z with wrong password"); + } catch (TikaException e) { + // If JCE is installed, the cause will be: Caused by: org.tukaani.xz.CorruptedInputException: Compressed data is corrupt + // If JCE is not installed, the message will include + // "(do you have the JCE Unlimited Strength Jurisdiction Policy Files installed?)" + ex = true; + } + assertTrue("TikaException for bad password", ex); + // Will be empty + assertEquals("", handler.toString()); + + ex = false; + // Right password works fine if JCE Unlimited Strength has been installed!!! 
+ if (isStrongCryptoAvailable()) { + recursingContext.set(PasswordProvider.class, new PasswordProvider() { + @Override + public String getPassword(Metadata metadata) { + return "Tika"; + } + }); + handler = new BodyContentHandler(); + try (InputStream stream = Seven7ParserTest.class.getResourceAsStream( + "/test-documents/test7Z_protected_passTika.7z")) { + parser.parse(stream, handler, metadata, recursingContext); + } + + assertEquals(TYPE_7ZIP.toString(), metadata.get(Metadata.CONTENT_TYPE)); + String content = handler.toString(); + + // Should get filename + assertContains("text.txt", content); + + // Should get contents from the text file in the 7z file + assertContains("TEST DATA FOR TIKA.", content); + assertContains("This is text inside an encrypted 7zip (7z) file.", content); + assertContains("It should be processed by Tika just fine!", content); + assertContains("TIKA-1521", content); + } else { + //if jce is not installed, test for IOException wrapped in TikaException + boolean ioe = false; + recursingContext.set(PasswordProvider.class, new PasswordProvider() { + @Override + public String getPassword(Metadata metadata) { + return "Tika"; + } + }); + handler = new BodyContentHandler(); + try (InputStream stream = Seven7ParserTest.class.getResourceAsStream( + "/test-documents/test7Z_protected_passTika.7z")) { + parser.parse(stream, handler, metadata, recursingContext); + } catch (TikaException e) { + ioe = true; + } + assertTrue("IOException because JCE was not installed", ioe); + } + } + + private static boolean isStrongCryptoAvailable() throws NoSuchAlgorithmException { + return Cipher.getMaxAllowedKeyLength("AES/ECB/PKCS5Padding") >= 256; + } +} Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java?rev=1725014&view=auto 
============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pkg; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +import java.io.InputStream; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +/** + * Test case for parsing tar files. + */ +public class TarParserTest extends AbstractPkgTest { + + @Test + public void testTarParsing() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = TarParserTest.class.getResourceAsStream( + "/test-documents/test-documents.tar")) { + parser.parse(stream, handler, metadata, recursingContext); + } + + assertEquals("application/x-tar", metadata.get(Metadata.CONTENT_TYPE)); + String content = handler.toString(); + assertContains("test-documents/testEXCEL.xls", content); + assertContains("test-documents/testHTML.html", content); + assertContains("test-documents/testOpenOffice2.odt", content); + assertContains("test-documents/testPDF.pdf", content); + assertContains("test-documents/testPPT.ppt", content); + assertContains("test-documents/testRTF.rtf", content); + assertContains("test-documents/testTXT.txt", content); + assertContains("test-documents/testWORD.doc", content); + assertContains("test-documents/testXML.xml", content); + } + + /** + * Tests that the ParseContext parser is correctly + * fired for all the embedded entries. + */ + @Test + public void testEmbedded() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = ZipParserTest.class.getResourceAsStream( + "/test-documents/test-documents.tar")) { + parser.parse(stream, handler, metadata, trackingContext); + } + + // Should have found all 9 documents, but not the directory + assertEquals(9, tracker.filenames.size()); + assertEquals(9, tracker.mediatypes.size()); + assertEquals(9, tracker.modifiedAts.size()); + + // Should have names but not content types, as tar doesn't + // store the content types + assertEquals("test-documents/testEXCEL.xls", tracker.filenames.get(0)); + assertEquals("test-documents/testHTML.html", tracker.filenames.get(1)); + assertEquals("test-documents/testOpenOffice2.odt", tracker.filenames.get(2)); + assertEquals("test-documents/testPDF.pdf", tracker.filenames.get(3)); + assertEquals("test-documents/testPPT.ppt", tracker.filenames.get(4)); + assertEquals("test-documents/testRTF.rtf", tracker.filenames.get(5)); + assertEquals("test-documents/testTXT.txt", tracker.filenames.get(6)); + assertEquals("test-documents/testWORD.doc", tracker.filenames.get(7)); + assertEquals("test-documents/testXML.xml", tracker.filenames.get(8)); + + for(String type : tracker.mediatypes) { + assertNull(type); + } + for(String crt : tracker.createdAts) { + assertNull(crt); + } + for(String mod : tracker.modifiedAts) { + assertNotNull(mod); + assertTrue("Modified at " + mod, mod.startsWith("20")); + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java?rev=1725014&view=auto ============================================================================== --- 
tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,192 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.pkg; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +import java.io.InputStream; +import java.util.HashSet; +import java.util.Set; + +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.compress.archivers.ArchiveStreamFactory; +import org.apache.tika.Tika; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; +import org.xml.sax.helpers.DefaultHandler; + +/** + * Test case for parsing zip files. + */ +public class ZipParserTest extends AbstractPkgTest { + + @Test + public void testZipParsing() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = ZipParserTest.class.getResourceAsStream( + "/test-documents/test-documents.zip")) { + parser.parse(stream, handler, metadata, recursingContext); + } + + assertEquals("application/zip", metadata.get(Metadata.CONTENT_TYPE)); + String content = handler.toString(); + assertContains("testEXCEL.xls", content); + assertContains("testHTML.html", content); + assertContains("testOpenOffice2.odt", content); + assertContains("testPDF.pdf", content); + assertContains("testPPT.ppt", content); + assertContains("testRTF.rtf", content); + assertContains("testTXT.txt", content); + assertContains("testWORD.doc", content); + assertContains("testXML.xml", content); + } + + /** + * Tests that the ParseContext parser is correctly + * fired for all the embedded entries. 
+ */ + @Test + public void testEmbedded() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = ZipParserTest.class.getResourceAsStream( + "/test-documents/test-documents.zip")) { + parser.parse(stream, handler, metadata, trackingContext); + } + + // Should have found all 9 documents + assertEquals(9, tracker.filenames.size()); + assertEquals(9, tracker.mediatypes.size()); + assertEquals(9, tracker.modifiedAts.size()); + + // Should have names and modified dates, but not content types, + // as zip doesn't store the content types + assertEquals("testEXCEL.xls", tracker.filenames.get(0)); + assertEquals("testHTML.html", tracker.filenames.get(1)); + assertEquals("testOpenOffice2.odt", tracker.filenames.get(2)); + assertEquals("testPDF.pdf", tracker.filenames.get(3)); + assertEquals("testPPT.ppt", tracker.filenames.get(4)); + assertEquals("testRTF.rtf", tracker.filenames.get(5)); + assertEquals("testTXT.txt", tracker.filenames.get(6)); + assertEquals("testWORD.doc", tracker.filenames.get(7)); + assertEquals("testXML.xml", tracker.filenames.get(8)); + + for(String type : tracker.mediatypes) { + assertNull(type); + } + for(String crt : tracker.createdAts) { + assertNull(crt); + } + for(String mod : tracker.modifiedAts) { + assertNotNull(mod); + assertTrue("Modified at " + mod, mod.startsWith("20")); + } + } + + /** + * Test case for the ability of the ZIP parser to extract the name of + * a ZIP entry even if the content of the entry is unreadable due to an + * unsupported compression method. 
+ * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-346">TIKA-346</a> + */ + @Test + public void testUnsupportedZipCompressionMethod() throws Exception { + String content = new Tika().parseToString( + ZipParserTest.class.getResourceAsStream( + "/test-documents/moby.zip")); + assertContains("README", content); + } + + private class GatherRelIDsDocumentExtractor implements EmbeddedDocumentExtractor { + public Set<String> allRelIDs = new HashSet<String>(); + public boolean shouldParseEmbedded(Metadata metadata) { + String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID); + if (relID != null) { + allRelIDs.add(relID); + } + return false; + } + + public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, boolean outputHtml) { + throw new UnsupportedOperationException("should never be called"); + } + } + + // TIKA-1036 + @Test + public void testPlaceholders() throws Exception { + String xml = getXML("testEmbedded.zip").xml; + assertContains("<div class=\"embedded\" id=\"test1.txt\" />", xml); + assertContains("<div class=\"embedded\" id=\"test2.txt\" />", xml); + + // Also make sure EMBEDDED_RELATIONSHIP_ID was + // passed when parsing the embedded docs: + Parser parser = new AutoDetectParser(); + ParseContext context = new ParseContext(); + context.set(Parser.class, parser); + GatherRelIDsDocumentExtractor relIDs = new GatherRelIDsDocumentExtractor(); + context.set(EmbeddedDocumentExtractor.class, relIDs); + try (InputStream input = getResourceAsStream("/test-documents/testEmbedded.zip")) { + parser.parse(input, + new BodyContentHandler(), + new Metadata(), + context); + } + + assertTrue(relIDs.allRelIDs.contains("test1.txt")); + assertTrue(relIDs.allRelIDs.contains("test2.txt")); + } + + @Test // TIKA-936 + public void testCustomEncoding() throws Exception { + ArchiveStreamFactory factory = new ArchiveStreamFactory(); + factory.setEntryEncoding("SJIS"); + trackingContext.set(ArchiveStreamFactory.class, 
factory); + + try (InputStream stream = TikaInputStream.get(Base64.decodeBase64( + "UEsDBBQAAAAIAI+CvUCDo3+zIgAAACgAAAAOAAAAk/qWe4zqg4GDgi50" + + "eHRr2tj0qulsc2pzRHN609Gm7Y1OvFxNYLHJv6ZV97yCiQEAUEsBAh" + + "QLFAAAAAgAj4K9QIOjf7MiAAAAKAAAAA4AAAAAAAAAAAAgAAAAAAAA" + + "AJP6lnuM6oOBg4IudHh0UEsFBgAAAAABAAEAPAAAAE4AAAAAAA=="))) { + autoDetectParser.parse( + stream, new DefaultHandler(), + new Metadata(), trackingContext); + } + + assertEquals(1, tracker.filenames.size()); + assertEquals( + "\u65E5\u672C\u8A9E\u30E1\u30E2.txt", + tracker.filenames.get(0)); + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-package-parser-module/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java Sat Jan 16 18:23:01 2016 @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pkg; + +import static org.junit.Assert.assertEquals; + +import java.io.InputStream; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +/** + * Test case for parsing zlib compressed + */ +public class ZlibParserTest extends AbstractPkgTest { + @Test + public void testZlibParsing() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = ZipParserTest.class.getResourceAsStream( + "/test-documents/testTXT.zlib")) { + parser.parse(stream, handler, metadata, recursingContext); + } + + assertEquals("application/zlib", metadata.get(Metadata.CONTENT_TYPE)); + } + + /** + * Tests that the ParseContext parser is correctly + * fired for all the embedded entries. + */ + @Test + public void testEmbedded() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = ZipParserTest.class.getResourceAsStream( + "/test-documents/testTXT.zlib")) { + parser.parse(stream, handler, metadata, trackingContext); + } + + // Should have found a single text document inside + assertEquals(1, tracker.filenames.size()); + assertEquals(1, tracker.mediatypes.size()); + assertEquals(1, tracker.modifiedAts.size()); + + // Won't have names, dates or types, as zlib doesn't have that + assertEquals(null, tracker.filenames.get(0)); + assertEquals(null, tracker.mediatypes.get(0)); + assertEquals(null, tracker.createdAts.get(0)); + assertEquals(null, tracker.modifiedAts.get(0)); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/pom.xml URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/pom.xml?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/pom.xml (added) +++ tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/pom.xml Sat Jan 16 18:23:01 2016 @@ -0,0 +1,106 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor + license agreements. See the NOTICE file distributed with this work for additional + information regarding copyright ownership. The ASF licenses this file to + you under the Apache License, Version 2.0 (the "License"); you may not use + this file except in compliance with the License. You may obtain a copy of + the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required + by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS + OF ANY KIND, either express or implied. 
See the License for the specific + language governing permissions and limitations under the License. --> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parser-modules</artifactId> + <version>2.0-SNAPSHOT</version> + </parent> + + <artifactId>tika-pdf-parser-module</artifactId> + <name>Apache Tika PDF Parser Module</name> + <url>http://tika.apache.org/</url> + + <properties> + <commons.logging.version>1.1.3</commons.logging.version> + </properties> + + <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + </dependency> + + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> + <dependency> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + <version>${commons.io.version}</version> + </dependency> + <dependency> + <groupId>org.apache.pdfbox</groupId> + <artifactId>pdfbox</artifactId> + <version>${pdfbox.version}</version> + </dependency> + <!-- TIKA-370: PDFBox declares the Bouncy Castle dependencies + as optional, but we prefer to have them always to avoid + problems with encrypted PDFs. 
--> + <dependency> + <groupId>org.bouncycastle</groupId> + <artifactId>bcmail-jdk15on</artifactId> + <version>1.52</version> + </dependency> + <dependency> + <groupId>org.bouncycastle</groupId> + <artifactId>bcprov-jdk15on</artifactId> + <version>1.52</version> + </dependency> + <dependency> + <groupId>commons-logging</groupId> + <artifactId>commons-logging</artifactId> + <version>${commons.logging.version}</version> + </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-log4j12</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-package-parser-module</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-text-parser-module</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-office-parser-module</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + </plugin> + </plugins> + </build> + +</project> \ No newline at end of file Added: tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java?rev=1725014&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java Sat Jan 16 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.parser.pdf;

import java.io.Serializable;

import org.apache.tika.exception.AccessPermissionException;
import org.apache.tika.metadata.AccessPermissions;
import org.apache.tika.metadata.Metadata;

/**
 * Decides, from the access-permission values recorded in a document's
 * {@link Metadata}, whether content extraction may proceed: either
 * generally, or only for accessibility purposes.
 * <p>
 * Both settings are fixed at construction time; instances carry no
 * other state.
 */
public class AccessChecker implements Serializable {

    private static final long serialVersionUID = 6492570218190936986L;

    /** When {@code false}, {@link #check(Metadata)} returns without inspecting anything. */
    private final boolean needToCheck;

    /** Whether extraction for accessibility is accepted when general extraction is denied. */
    private final boolean allowAccessibility;

    /**
     * Creates a checker that performs no checking at all:
     * {@link #check(Metadata)} always returns without throwing.
     * <p>
     * This constructor exists to allow for Tika's legacy (&lt;= v1.7) behavior.
     */
    public AccessChecker() {
        this.needToCheck = false;
        this.allowAccessibility = true;
    }

    /**
     * Creates a checker that enforces the document's extraction permissions.
     *
     * @param allowExtractionForAccessibility if general extraction is not
     *        allowed, whether extraction for accessibility is still accepted
     */
    public AccessChecker(boolean allowExtractionForAccessibility) {
        this.needToCheck = true;
        this.allowAccessibility = allowExtractionForAccessibility;
    }

    /**
     * Verifies that the document described by {@code metadata} may have its
     * content extracted, given the settings chosen in the constructor.
     * <p>
     * The call returns normally when checking is disabled, or when the value
     * stored under {@link AccessPermissions#EXTRACT_CONTENT} is anything other
     * than the string {@code "false"}. Otherwise extraction is only accepted
     * if this checker allows accessibility extraction and the value stored
     * under {@link AccessPermissions#EXTRACT_FOR_ACCESSIBILITY} is
     * {@code "true"}.
     *
     * @param metadata metadata holding the document's access-permission
     *        values; it is dereferenced whenever checking is enabled
     * @throws AccessPermissionException if access is not permitted
     */
    public void check(Metadata metadata) throws AccessPermissionException {
        // Short-circuit keeps the metadata untouched when checking is disabled.
        if (!needToCheck
                || !"false".equals(metadata.get(AccessPermissions.EXTRACT_CONTENT))) {
            return;
        }

        // General extraction is denied from here on.
        if (!allowAccessibility) {
            throw new AccessPermissionException("Content extraction is not allowed.");
        }

        // Only an explicit "true" grants extraction for accessibility.
        if (!"true".equals(metadata.get(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY))) {
            throw new AccessPermissionException(
                    "Content extraction for accessibility is not allowed.");
        }
    }
}
