Modified: tika/branches/2.x/tika-parsers/pom.xml URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parsers/pom.xml?rev=1723223&r1=1723222&r2=1723223&view=diff ============================================================================== --- tika/branches/2.x/tika-parsers/pom.xml (original) +++ tika/branches/2.x/tika-parsers/pom.xml Wed Jan 6 03:50:50 2016 @@ -43,8 +43,6 @@ <mime4j.version>0.7.2</mime4j.version> <vorbis.version>0.6</vorbis.version> <pdfbox.version>1.8.10</pdfbox.version> - <netcdf-java.version>4.5.5</netcdf-java.version> - <cxf.version>3.0.3</cxf.version> </properties> <dependencies> @@ -77,16 +75,6 @@ <artifactId>vorbis-java-tika</artifactId> <version>${vorbis.version}</version> </dependency> - <dependency> - <groupId>com.healthmarketscience.jackcess</groupId> - <artifactId>jackcess</artifactId> - <version>2.1.2</version> - </dependency> - <dependency> - <groupId>com.healthmarketscience.jackcess</groupId> - <artifactId>jackcess-encrypt</artifactId> - <version>2.1.1</version> - </dependency> <!-- Optional OSGi dependencies, used only when running within OSGi --> <dependency> @@ -95,27 +83,6 @@ <scope>provided</scope> </dependency> - <!-- Upstream parser libraries --> - <dependency> - <groupId>net.sourceforge.jmatio</groupId> - <artifactId>jmatio</artifactId> - <version>1.0</version> - </dependency> - <dependency> - <groupId>org.apache.james</groupId> - <artifactId>apache-mime4j-core</artifactId> - <version>${mime4j.version}</version> - </dependency> - <dependency> - <groupId>org.apache.james</groupId> - <artifactId>apache-mime4j-dom</artifactId> - <version>${mime4j.version}</version> - </dependency> - <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-compress</artifactId> - <version>${commons.compress.version}</version> - </dependency> <dependency> <groupId>org.tukaani</groupId> <artifactId>xz</artifactId> @@ -132,19 +99,6 @@ <artifactId>pdfbox</artifactId> <version>${pdfbox.version}</version> </dependency> - <!-- 
TIKA-370: PDFBox declares the Bouncy Castle dependencies - as optional, but we prefer to have them always to avoid - problems with encrypted PDFs. --> - <dependency> - <groupId>org.bouncycastle</groupId> - <artifactId>bcmail-jdk15on</artifactId> - <version>1.52</version> - </dependency> - <dependency> - <groupId>org.bouncycastle</groupId> - <artifactId>bcprov-jdk15on</artifactId> - <version>1.52</version> - </dependency> <dependency> <groupId>org.apache.poi</groupId> @@ -171,116 +125,98 @@ </exclusion> </exclusions> </dependency> + + + <!-- Apache cTAKES --> <dependency> - <groupId>org.ccil.cowan.tagsoup</groupId> - <artifactId>tagsoup</artifactId> - <version>1.2.1</version> + <groupId>org.apache.ctakes</groupId> + <artifactId>ctakes-core</artifactId> + <version>3.2.2</version> + <scope>provided</scope> </dependency> <dependency> - <groupId>org.ow2.asm</groupId> - <artifactId>asm</artifactId> - <version>5.0.4</version> + <groupId>org.xerial</groupId> + <artifactId>sqlite-jdbc</artifactId> + <version>3.8.10.1</version> + <scope>provided</scope> </dependency> - <dependency> - <groupId>com.googlecode.mp4parser</groupId> - <artifactId>isoparser</artifactId> - <version>1.0.2</version> + <groupId>org.gagravarr</groupId> + <artifactId>vorbis-java-core</artifactId> + <version>${vorbis.version}</version> </dependency> <dependency> - <groupId>com.drewnoakes</groupId> - <artifactId>metadata-extractor</artifactId> - <version>2.8.0</version> + <groupId>${project.groupId}</groupId> + <artifactId>tika-multimedia-module</artifactId> + <version>${project.version}</version> </dependency> <dependency> - <groupId>de.l3s.boilerpipe</groupId> - <artifactId>boilerpipe</artifactId> - <version>1.1.0</version> + <groupId>${project.groupId}</groupId> + <artifactId>tika-advanced-module</artifactId> + <version>${project.version}</version> </dependency> <dependency> - <groupId>rome</groupId> - <artifactId>rome</artifactId> - <version>1.0</version> + <groupId>${project.groupId}</groupId> + 
<artifactId>tika-cad-module</artifactId> + <version>${project.version}</version> </dependency> <dependency> - <groupId>org.gagravarr</groupId> - <artifactId>vorbis-java-core</artifactId> - <version>${vorbis.version}</version> + <groupId>${project.groupId}</groupId> + <artifactId>tika-code-module</artifactId> + <version>${project.version}</version> </dependency> <dependency> - <groupId>com.googlecode.juniversalchardet</groupId> - <artifactId>juniversalchardet</artifactId> - <version>1.0.3</version> + <groupId>${project.groupId}</groupId> + <artifactId>tika-database-module</artifactId> + <version>${project.version}</version> </dependency> <dependency> - <groupId>org.codelibs</groupId> - <artifactId>jhighlight</artifactId> - <version>1.0.2</version> + <groupId>${project.groupId}</groupId> + <artifactId>tika-ebook-module</artifactId> + <version>${project.version}</version> </dependency> <dependency> - <groupId>com.pff</groupId> - <artifactId>java-libpst</artifactId> - <version>0.8.1</version> + <groupId>${project.groupId}</groupId> + <artifactId>tika-journal-module</artifactId> + <version>${project.version}</version> </dependency> <dependency> - <groupId>com.github.junrar</groupId> - <artifactId>junrar</artifactId> - <version>0.7</version> - </dependency> - <dependency> - <groupId>org.apache.cxf</groupId> - <artifactId>cxf-rt-rs-client</artifactId> - <version>${cxf.version}</version> + <groupId>${project.groupId}</groupId> + <artifactId>tika-office-module</artifactId> + <version>${project.version}</version> </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-multimedia-module</artifactId> + <artifactId>tika-package-module</artifactId> <version>${project.version}</version> </dependency> - - <!-- Provided dependencies --> <dependency> - <groupId>org.xerial</groupId> - <artifactId>sqlite-jdbc</artifactId> - <version>3.8.10.1</version> - <scope>provided</scope> + <groupId>${project.groupId}</groupId> + 
<artifactId>tika-pdf-module</artifactId> + <version>${project.version}</version> </dependency> - <dependency> - <groupId>org.apache.opennlp</groupId> - <artifactId>opennlp-tools</artifactId> - <version>1.5.3</version> + <groupId>${project.groupId}</groupId> + <artifactId>tika-scientific-module</artifactId> + <version>${project.version}</version> </dependency> - <dependency> - <groupId>commons-io</groupId> - <artifactId>commons-io</artifactId> - <version>${commons.io.version}</version> + <groupId>${project.groupId}</groupId> + <artifactId>tika-text-module</artifactId> + <version>${project.version}</version> </dependency> - <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-exec</artifactId> - <version>1.3</version> + <groupId>${project.groupId}</groupId> + <artifactId>tika-web-module</artifactId> + <version>${project.version}</version> </dependency> - <dependency> - <groupId>com.googlecode.json-simple</groupId> - <artifactId>json-simple</artifactId> - <version>1.1.1</version> - <exclusions> - <exclusion> - <groupId>junit</groupId> - <artifactId>junit</artifactId> - </exclusion> - </exclusions> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + <version>${commons.io.version}</version> </dependency> - <dependency> - <groupId>org.json</groupId> - <artifactId>json</artifactId> - <version>20140107</version> - </dependency> + <!-- Test dependencies --> <dependency> @@ -305,68 +241,11 @@ <scope>test</scope> <type>test-jar</type> </dependency> - - <!-- edu.ucar dependencies --> - <dependency> - <groupId>edu.ucar</groupId> - <artifactId>netcdf4</artifactId> - <version>${netcdf-java.version}</version> - </dependency> - <dependency> - <groupId>edu.ucar</groupId> - <artifactId>grib</artifactId> - <version>${netcdf-java.version}</version> - </dependency> - <dependency> - <groupId>edu.ucar</groupId> - <artifactId>cdm</artifactId> - <version>${netcdf-java.version}</version> - <exclusions> - <exclusion> - <groupId>org.slf4j</groupId> - 
<artifactId>jcl-over-slf4j</artifactId> - </exclusion> - </exclusions> - </dependency> - <dependency> - <groupId>edu.ucar</groupId> - <artifactId>httpservices</artifactId> - <version>${netcdf-java.version}</version> - </dependency> - <!-- Apache Commons CSV --> - <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-csv</artifactId> - <version>1.0</version> - </dependency> - - <dependency> - <groupId>org.apache.sis.core</groupId> - <artifactId>sis-utility</artifactId> - <version>0.5</version> - </dependency> - <dependency> - <groupId>org.apache.sis.storage</groupId> - <artifactId>sis-netcdf</artifactId> - <version>0.5</version> - </dependency> - <dependency> - <groupId>org.apache.sis.core</groupId> - <artifactId>sis-metadata</artifactId> - <version>0.5</version> - </dependency> <dependency> <groupId>org.opengis</groupId> <artifactId>geoapi</artifactId> <version>3.0.0</version> </dependency> - <!-- Apache cTAKES --> - <dependency> - <groupId>org.apache.ctakes</groupId> - <artifactId>ctakes-core</artifactId> - <version>3.2.2</version> - <scope>provided</scope> - </dependency> </dependencies> <build> @@ -441,8 +320,31 @@ <artifactSet> <includes> <include>org.apache.tika:tika-multimedia-module</include> + <include>org.apache.tika:tika-advanced-module</include> + <include>org.apache.tika:tika-cad-module</include> + <include>org.apache.tika:tika-code-module</include> + <include>org.apache.tika:tika-database-module</include> + <include>org.apache.tika:tika-ebook-module</include> + <include>org.apache.tika:tika-journal-module</include> + <include>org.apache.tika:tika-office-module</include> + <include>org.apache.tika:tika-package-module</include> + <include>org.apache.tika:tika-pdf-module</include> + <include>org.apache.tika:tika-scientific-module</include> + <include>org.apache.tika:tika-text-module</include> + <include>org.apache.tika:tika-web-module</include> </includes> </artifactSet> + <transformers> + <transformer 
implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer"> + <resource>META-INF/services/org.apache.tika.detect.Detector</resource> + </transformer> + <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer"> + <resource>META-INF/services/org.apache.tika.detect.EncodingDetector</resource> + </transformer> + <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer"> + <resource>META-INF/services/org.apache.tika.parser.Parser</resource> + </transformer> + </transformers> </configuration> </execution> </executions>
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
// Source: tika/branches/2.x/tika-parsers/src/test/java/org/apache/tika/parser/pkg/PackageTest.java (added in r1723223)
package org.apache.tika.parser.pkg;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.InputStream;

import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Before;
import org.junit.Test;
import org.xml.sax.ContentHandler;

/**
 * Checks that an {@link AutoDetectParser} identifies each archive or
 * compression format from its content and that the text of the documents
 * stored inside the container ends up in the extracted body.
 *
 * <p>Every test parses one fixture from {@code /test-documents/} on the
 * classpath, asserts the detected {@code Content-Type}, and then asserts
 * that the expected entry names and text snippets are present.
 */
public class PackageTest extends TikaTest {

    private static final MediaType TYPE_7ZIP = MediaType.application("x-7z-compressed");

    /** Directory prefix that most archive fixtures store in their entry names. */
    private static final String TEST_DOCUMENTS_DIR = "test-documents/";

    /**
     * Entry name and a text snippet from that entry, for every document
     * packed into the {@code test-documents.*} archive fixtures.
     */
    private static final String[][] STANDARD_DOCUMENTS = {
        {"testEXCEL.xls", "Sample Excel Worksheet"},
        {"testHTML.html", "Test Indexation Html"},
        {"testOpenOffice2.odt", "This is a sample Open Office document"},
        {"testPDF.pdf", "Apache Tika"},
        {"testPPT.ppt", "Sample Powerpoint Slide"},
        {"testRTF.rtf", "indexation Word"},
        {"testTXT.txt", "Test d'indexation de Txt"},
        {"testWORD.doc", "This is a sample Microsoft Word Document"},
        {"testXML.xml", "Rida Benjelloun"},
    };

    private ParseContext recursingContext;
    private Parser autoDetectParser;

    /**
     * Creates the auto-detecting parser and a context that registers that
     * same parser under {@code Parser.class}, so that entries inside a
     * container are parsed with auto-detection as well.
     */
    @Before
    public void setUp() throws Exception {
        autoDetectParser = new AutoDetectParser();
        recursingContext = new ParseContext();
        recursingContext.set(Parser.class, autoDetectParser);
    }

    /** A zlib-compressed text file is detected and its text extracted. */
    @Test
    public void testZlibParsing() throws Exception {
        String content = parseToText("/test-documents/testTXT.zlib", "application/zlib");
        assertContains("Test d'indexation de Txt", content);
        assertContains("http://www.apache.org", content);
    }

    /**
     * Two {@code ar} archives are detected and their entries extracted.
     * Each archive is parsed into its own handler and metadata so that the
     * second set of assertions cannot be satisfied by leftovers of the first.
     */
    @Test
    public void testArParsing() throws Exception {
        String textArchive =
                parseToText("/test-documents/testARofText.ar", "application/x-archive");
        assertContains("testTXT.txt", textArchive);
        assertContains("Test d'indexation de Txt", textArchive);
        assertContains("http://www.apache.org", textArchive);

        String soundArchive =
                parseToText("/test-documents/testARofSND.ar", "application/x-archive");
        assertContains("testAU.au", soundArchive);
    }

    /** A bzip2-compressed tarball is detected and all packed documents extracted. */
    @Test
    public void testBzip2Parsing() throws Exception {
        String content =
                parseToText("/test-documents/test-documents.tbz2", "application/x-bzip2");
        assertStandardDocuments(content, TEST_DOCUMENTS_DIR);
    }

    /** A {@code .tar.Z} (compress) tarball is detected and all packed documents extracted. */
    @Test
    public void testCompressParsing() throws Exception {
        String content =
                parseToText("/test-documents/test-documents.tar.Z", "application/x-compress");
        assertStandardDocuments(content, TEST_DOCUMENTS_DIR);
    }

    /** A gzip-compressed tarball is detected and all packed documents extracted. */
    @Test
    public void testGzipParsing() throws Exception {
        String content =
                parseToText("/test-documents/test-documents.tgz", "application/gzip");
        assertStandardDocuments(content, TEST_DOCUMENTS_DIR);
    }

    /** A RAR archive is detected and all packed documents extracted. */
    @Test
    public void testRarParsing() throws Exception {
        String content = parseToText(
                "/test-documents/test-documents.rar", "application/x-rar-compressed");
        assertStandardDocuments(content, TEST_DOCUMENTS_DIR);
    }

    /**
     * A 7z archive is detected and all packed documents extracted. Before
     * parsing, the parser must advertise 7z among its supported types.
     */
    @Test
    public void test7ZParsing() throws Exception {
        // Ensure 7zip is a parsable format
        assertTrue("No 7zip parser found",
                autoDetectParser.getSupportedTypes(recursingContext).contains(TYPE_7ZIP));

        String content =
                parseToText("/test-documents/test-documents.7z", TYPE_7ZIP.toString());
        assertStandardDocuments(content, TEST_DOCUMENTS_DIR);
    }

    /** A plain tar archive is detected and all packed documents extracted. */
    @Test
    public void testTarParsing() throws Exception {
        String content =
                parseToText("/test-documents/test-documents.tar", "application/x-tar");
        assertStandardDocuments(content, TEST_DOCUMENTS_DIR);
    }

    /**
     * A zip archive is detected and all packed documents extracted. Entry
     * names are matched without the directory prefix for this fixture.
     */
    @Test
    public void testZipParsing() throws Exception {
        String content =
                parseToText("/test-documents/test-documents.zip", "application/zip");
        assertStandardDocuments(content, "");
    }

    /** A gzip-compressed SVG is reported as gzip and the SVG text extracted. */
    @Test
    public void testSvgzParsing() throws Exception {
        String content = parseToText("/test-documents/testSVG.svgz", "application/gzip");
        assertContains("Test SVG image", content);
    }

    /**
     * Parses one classpath fixture with the auto-detecting parser, using a
     * fresh handler and fresh metadata, and verifies the detected type.
     *
     * @param resource     absolute classpath location of the fixture
     * @param expectedType value the parser must store under {@code Content-Type}
     * @return the body text collected by the content handler
     * @throws Exception if the fixture cannot be read or parsed
     */
    private String parseToText(String resource, String expectedType) throws Exception {
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        try (InputStream stream = PackageTest.class.getResourceAsStream(resource)) {
            // Fail with a readable message instead of an NPE deep inside the parser.
            assertNotNull("Test resource not found: " + resource, stream);
            autoDetectParser.parse(stream, handler, metadata, recursingContext);
        }

        assertEquals(expectedType, metadata.get(Metadata.CONTENT_TYPE));
        return handler.toString();
    }

    /**
     * Asserts that every document listed in {@link #STANDARD_DOCUMENTS}
     * appears in the extracted text, both by entry name and by a snippet of
     * its content.
     *
     * @param content    text extracted from the container
     * @param pathPrefix prefix expected in front of each entry name; may be empty
     */
    private void assertStandardDocuments(String content, String pathPrefix) {
        for (String[] document : STANDARD_DOCUMENTS) {
            assertContains(pathPrefix + document[0], content);
            assertContains(document[1], content);
        }
    }
}
