This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit be6e95d45bdfc40f35e93b26e45533d0a78ebd48 Author: tballison <talli...@mitre.org> AuthorDate: Tue Mar 6 15:50:06 2018 -0500 TIKA-2576 -- Upgrade commons compress and add detection and parsing of zstd (if user provides com.github.luben:zstd-jni... via Andreas Meier --- CHANGES.txt | 3 +++ .../resources/org/apache/tika/mime/tika-mimetypes.xml | 9 ++++++++- tika-parent/pom.xml | 2 +- tika-parsers/pom.xml | 8 +++++++- .../java/org/apache/tika/parser/pkg/CompressorParser.java | 8 +++++++- .../src/test/java/org/apache/tika/mime/TestMimeTypes.java | 2 ++ .../org/apache/tika/parser/pkg/CompressorParserTest.java | 7 +++++++ .../src/test/resources/test-documents/testZSTD.zstd | Bin 0 -> 143 bytes 8 files changed, 35 insertions(+), 4 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 7b78929..d553961 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,8 @@ Release 1.18 - ??? + * Add detection and parsing of zstd (if user provides + com.github.luben:zstd-jni) via Andreas Meier (TIKA-2576) + * Allow for RFC822 detection for files starting with "dkim-" and/or "x-" via Andreas Meier (TIKA-2578 and TIKA-2587) diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index 7432a56..f6a8844 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -3453,7 +3453,14 @@ <glob pattern="*.tgz" /> <glob pattern="*-gz" /> </mime-type> - + <mime-type type="application/zstd"> + <_comment>https://en.wikipedia.org/wiki/Zstandard</_comment> + <_comment>https://tools.ietf.org/id/draft-kucherawy-dispatch-zstd-01.html</_comment> + <magic priority="50"> + <match value="0xFD2FB528" type="little32" offset="0"/> + </magic> + <glob pattern="*.zstd"/> + </mime-type> <mime-type type="application/x-hdf"> <_comment>Hierarchical Data Format File</_comment> <magic priority="50"> diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index 2cbcf4e..03c8ea0 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -306,7 +306,7 @@ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.reporting.outputEncoding>${project.build.sourceEncoding}</project.reporting.outputEncoding> <!-- NOTE: sync tukaani version with commons-compress in tika-parsers --> - <commons.compress.version>1.14</commons.compress.version> + <commons.compress.version>1.16.1</commons.compress.version> <commons.io.version>2.6</commons.io.version> <gson.version>2.8.1</gson.version> <cxf.version>3.0.16</cxf.version> diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml index 271ec07..35787cb 100644 --- a/tika-parsers/pom.xml +++ b/tika-parsers/pom.xml @@ -39,7 +39,7 @@ <!-- NOTE: sync codec version with POI --> <codec.version>1.10</codec.version> <!-- NOTE: sync tukaani version with commons-compress in tika-parent--> - <tukaani.version>1.6</tukaani.version> + <tukaani.version>1.8</tukaani.version> <mime4j.version>0.8.1</mime4j.version> <vorbis.version>0.8</vorbis.version> <pdfbox.version>2.0.8</pdfbox.version> @@ -150,6 +150,12 @@ <artifactId>xz</artifactId> <version>${tukaani.version}</version> </dependency> + <dependency> + <groupId>com.github.luben</groupId> + <artifactId>zstd-jni</artifactId> + <version>1.3.3-3</version> + <scope>provided</scope> + </dependency> <dependency> <groupId>commons-codec</groupId> diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java index 48f8bec..ada7ec9 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java @@ -75,10 +75,12 @@ public class CompressorParser extends AbstractParser { private static final MediaType ZLIB = MediaType.application("zlib"); private static final MediaType LZMA = MediaType.application("x-lzma"); private static final MediaType LZ4_FRAMED = MediaType.application("x-lz4"); + private static final MediaType ZSTD = MediaType.application("zstd"); + private static final MediaType DEFLATE64= MediaType.application("deflate64"); private static final Set<MediaType> SUPPORTED_TYPES = MediaType.set(BZIP, BZIP2, GZIP, GZIP_ALT, LZ4_FRAMED, COMPRESS, - XZ, PACK, SNAPPY_FRAMED, ZLIB, LZMA); + XZ, PACK, SNAPPY_FRAMED, ZLIB, LZMA, ZSTD); private int memoryLimitInKb = 100000;//100MB @@ -141,6 +143,10 @@ public class CompressorParser extends AbstractParser { return SNAPPY_RAW; } else if (CompressorStreamFactory.LZMA.equals(name)) { return LZMA; + } else if (CompressorStreamFactory.ZSTANDARD.equals(name)) { + return ZSTD; + } else if (CompressorStreamFactory.DEFLATE64.equals(name)) { + return DEFLATE64; } else { return MediaType.OCTET_STREAM; } diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java index e76a7d5..bbb25e5 100644 --- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java +++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java @@ -379,6 +379,8 @@ public class TestMimeTypes { // For spanned zip files, the .zip file doesn't have the header, it's the other parts assertTypeByData("application/octet-stream", "test-documents-spanned.zip"); assertTypeByData("application/zip", "test-documents-spanned.z01"); + + assertTypeDetection("testZSTD.zstd", "application/zstd"); } @Test diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java index 444afc7..26552eb 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java @@ -42,6 +42,7 @@ public class CompressorParserTest extends TikaTest { NOT_COVERED.add(MediaType.application("x-brotli")); NOT_COVERED.add(MediaType.application("x-lz4-block")); NOT_COVERED.add(MediaType.application("x-snappy-raw")); + NOT_COVERED.add(MediaType.application("deflate64")); } @Test @@ -61,6 +62,12 @@ public class CompressorParserTest extends TikaTest { } @Test + public void testZstd() throws Exception { + XMLResult r = getXML("testZSTD.zstd"); + assertContains("0123456789", r.xml); + } + + @Test public void testCoverage() throws Exception { //test that the package parser covers all inputstreams handled //by CompressorStreamFactory. When we update commons-compress, and they add diff --git a/tika-parsers/src/test/resources/test-documents/testZSTD.zstd b/tika-parsers/src/test/resources/test-documents/testZSTD.zstd new file mode 100644 index 0000000..f594f1a Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testZSTD.zstd differ -- To stop receiving notification emails like this one, please contact talli...@apache.org.