This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 3701f2d TIKA-2576 -- Upgrade commons compress and add detection and
parsing of zstd (if user provides com.github.luben:zstd-jni... via Andreas Meier
3701f2d is described below
commit 3701f2d340ee56af10aa1b6cc44375d71b50bb52
Author: tballison <[email protected]>
AuthorDate: Tue Mar 6 15:50:06 2018 -0500
TIKA-2576 -- Upgrade commons compress and add detection and parsing of zstd
(if user provides com.github.luben:zstd-jni... via Andreas Meier
---
CHANGES.txt | 3 +++
.../resources/org/apache/tika/mime/tika-mimetypes.xml | 9 ++++++++-
tika-parent/pom.xml | 2 +-
tika-parsers/pom.xml | 8 +++++++-
.../java/org/apache/tika/parser/pkg/CompressorParser.java | 8 +++++++-
.../src/test/java/org/apache/tika/mime/TestMimeTypes.java | 2 ++
.../org/apache/tika/parser/pkg/CompressorParserTest.java | 7 +++++++
.../src/test/resources/test-documents/testZSTD.zstd | Bin 0 -> 143 bytes
8 files changed, 35 insertions(+), 4 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 47c2864..10ede92 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -5,6 +5,9 @@ Release 2.0.0 - ???
Other changes
+ * Add detection and parsing of zstd (if user provides
+ com.github.luben:zstd-jni) via Andreas Meier (TIKA-2576)
+
* Allow for RFC822 detection for files starting with "dkim-"
and/or "x-" via Andreas Meier (TIKA-2578 and TIKA-2587)
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 857868b..e8da795 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3478,7 +3478,14 @@
<glob pattern="*.tgz" />
<glob pattern="*-gz" />
</mime-type>
-
+ <mime-type type="application/zstd">
+ <_comment>https://en.wikipedia.org/wiki/Zstandard</_comment>
+ <_comment>https://tools.ietf.org/id/draft-kucherawy-dispatch-zstd-01.html</_comment>
+ <magic priority="50">
+ <match value="0xFD2FB528" type="little32" offset="0"/>
+ </magic>
+ <glob pattern="*.zstd"/>
+ </mime-type>
<mime-type type="application/x-hdf">
<_comment>Hierarchical Data Format File</_comment>
<magic priority="50">
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 8a0ceec..c047af3 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -306,7 +306,7 @@
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>${project.build.sourceEncoding}</project.reporting.outputEncoding>
<!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
- <commons.compress.version>1.14</commons.compress.version>
+ <commons.compress.version>1.16.1</commons.compress.version>
<commons.io.version>2.6</commons.io.version>
<gson.version>2.8.1</gson.version>
<cxf.version>3.0.16</cxf.version>
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 4ce29bb..35f4078 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -39,7 +39,7 @@
<!-- NOTE: sync codec version with POI -->
<codec.version>1.10</codec.version>
<!-- NOTE: sync tukaani version with commons-compress in tika-parent-->
- <tukaani.version>1.6</tukaani.version>
+ <tukaani.version>1.8</tukaani.version>
<mime4j.version>0.8.1</mime4j.version>
<vorbis.version>0.8</vorbis.version>
<pdfbox.version>2.0.8</pdfbox.version>
@@ -150,6 +150,12 @@
<artifactId>xz</artifactId>
<version>${tukaani.version}</version>
</dependency>
+ <dependency>
+ <groupId>com.github.luben</groupId>
+ <artifactId>zstd-jni</artifactId>
+ <version>1.3.3-3</version>
+ <scope>provided</scope>
+ </dependency>
<dependency>
<groupId>commons-codec</groupId>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index 3ffaf66..159f78c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@ -76,10 +76,12 @@ public class CompressorParser extends AbstractParser {
private static final MediaType ZLIB = MediaType.application("zlib");
private static final MediaType LZMA = MediaType.application("x-lzma");
private static final MediaType LZ4_FRAMED = MediaType.application("x-lz4");
+ private static final MediaType ZSTD = MediaType.application("zstd");
+ private static final MediaType DEFLATE64= MediaType.application("deflate64");
private static final Set<MediaType> SUPPORTED_TYPES =
MediaType.set(BZIP, BZIP2, GZIP, GZIP_ALT, LZ4_FRAMED, COMPRESS,
- XZ, PACK, SNAPPY_FRAMED, ZLIB, LZMA);
+ XZ, PACK, SNAPPY_FRAMED, ZLIB, LZMA, ZSTD);
private int memoryLimitInKb = 100000;//100MB
@@ -142,6 +144,10 @@ public class CompressorParser extends AbstractParser {
return SNAPPY_RAW;
} else if (CompressorStreamFactory.LZMA.equals(name)) {
return LZMA;
+ } else if (CompressorStreamFactory.ZSTANDARD.equals(name)) {
+ return ZSTD;
+ } else if (CompressorStreamFactory.DEFLATE64.equals(name)) {
+ return DEFLATE64;
} else {
return MediaType.OCTET_STREAM;
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 4c63f21..835a525 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -380,6 +380,8 @@ public class TestMimeTypes {
// For spanned zip files, the .zip file doesn't have the header, it's the other parts
assertTypeByData("application/octet-stream", "test-documents-spanned.zip");
assertTypeByData("application/zip", "test-documents-spanned.z01");
+
+ assertTypeDetection("testZSTD.zstd", "application/zstd");
}
@Test
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
index 444afc7..26552eb 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
@@ -42,6 +42,7 @@ public class CompressorParserTest extends TikaTest {
NOT_COVERED.add(MediaType.application("x-brotli"));
NOT_COVERED.add(MediaType.application("x-lz4-block"));
NOT_COVERED.add(MediaType.application("x-snappy-raw"));
+ NOT_COVERED.add(MediaType.application("deflate64"));
}
@Test
@@ -61,6 +62,12 @@ public class CompressorParserTest extends TikaTest {
}
@Test
+ public void testZstd() throws Exception {
+ XMLResult r = getXML("testZSTD.zstd");
+ assertContains("0123456789", r.xml);
+ }
+
+ @Test
public void testCoverage() throws Exception {
//test that the package parser covers all inputstreams handled
//by CompressorStreamFactory. When we update commons-compress, and they add
diff --git a/tika-parsers/src/test/resources/test-documents/testZSTD.zstd b/tika-parsers/src/test/resources/test-documents/testZSTD.zstd
new file mode 100644
index 0000000..f594f1a
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testZSTD.zstd differ
--
To stop receiving notification emails like this one, please contact
[email protected].