This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit be6e95d45bdfc40f35e93b26e45533d0a78ebd48
Author: tballison <talli...@mitre.org>
AuthorDate: Tue Mar 6 15:50:06 2018 -0500

    TIKA-2576 -- Upgrade commons compress and add detection and parsing of zstd 
(if user provides com.github.luben:zstd-jni) via Andreas Meier
---
 CHANGES.txt                                               |   3 +++
 .../resources/org/apache/tika/mime/tika-mimetypes.xml     |   9 ++++++++-
 tika-parent/pom.xml                                       |   2 +-
 tika-parsers/pom.xml                                      |   8 +++++++-
 .../java/org/apache/tika/parser/pkg/CompressorParser.java |   8 +++++++-
 .../src/test/java/org/apache/tika/mime/TestMimeTypes.java |   2 ++
 .../org/apache/tika/parser/pkg/CompressorParserTest.java  |   7 +++++++
 .../src/test/resources/test-documents/testZSTD.zstd       | Bin 0 -> 143 bytes
 8 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 7b78929..d553961 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.18 - ???
 
+   * Add detection and parsing of zstd (if user provides
+     com.github.luben:zstd-jni) via Andreas Meier (TIKA-2576)
+
    * Allow for RFC822 detection for files starting with "dkim-"
      and/or "x-" via Andreas Meier (TIKA-2578 and TIKA-2587)
 
diff --git 
a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 7432a56..f6a8844 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3453,7 +3453,14 @@
     <glob pattern="*.tgz" />
     <glob pattern="*-gz" />
   </mime-type>
-
+  <mime-type type="application/zstd">
+    <_comment>https://en.wikipedia.org/wiki/Zstandard</_comment>
+    
<_comment>https://tools.ietf.org/id/draft-kucherawy-dispatch-zstd-01.html</_comment>
+    <magic priority="50">
+      <match value="0xFD2FB528" type="little32" offset="0"/>
+    </magic>
+    <glob pattern="*.zstd"/>
+  </mime-type>
   <mime-type type="application/x-hdf">
     <_comment>Hierarchical Data Format File</_comment>
     <magic priority="50">
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 2cbcf4e..03c8ea0 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -306,7 +306,7 @@
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     
<project.reporting.outputEncoding>${project.build.sourceEncoding}</project.reporting.outputEncoding>
     <!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
-    <commons.compress.version>1.14</commons.compress.version>
+    <commons.compress.version>1.16.1</commons.compress.version>
     <commons.io.version>2.6</commons.io.version>
     <gson.version>2.8.1</gson.version>
     <cxf.version>3.0.16</cxf.version>
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 271ec07..35787cb 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -39,7 +39,7 @@
     <!-- NOTE: sync codec version with POI -->
     <codec.version>1.10</codec.version>
     <!-- NOTE: sync tukaani version with commons-compress in tika-parent-->
-    <tukaani.version>1.6</tukaani.version>
+    <tukaani.version>1.8</tukaani.version>
     <mime4j.version>0.8.1</mime4j.version>
     <vorbis.version>0.8</vorbis.version>
     <pdfbox.version>2.0.8</pdfbox.version>
@@ -150,6 +150,12 @@
       <artifactId>xz</artifactId>
       <version>${tukaani.version}</version>
     </dependency>
+    <dependency>
+      <groupId>com.github.luben</groupId>
+      <artifactId>zstd-jni</artifactId>
+      <version>1.3.3-3</version>
+      <scope>provided</scope>
+    </dependency>
 
     <dependency>
       <groupId>commons-codec</groupId>
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index 48f8bec..ada7ec9 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@ -75,10 +75,12 @@ public class CompressorParser extends AbstractParser {
     private static final MediaType ZLIB = MediaType.application("zlib");
     private static final MediaType LZMA = MediaType.application("x-lzma");
     private static final MediaType LZ4_FRAMED = MediaType.application("x-lz4");
+    private static final MediaType ZSTD = MediaType.application("zstd");
+    private static final MediaType DEFLATE64= 
MediaType.application("deflate64");
 
     private static final Set<MediaType> SUPPORTED_TYPES =
             MediaType.set(BZIP, BZIP2, GZIP, GZIP_ALT, LZ4_FRAMED, COMPRESS,
-                    XZ, PACK, SNAPPY_FRAMED, ZLIB, LZMA);
+                    XZ, PACK, SNAPPY_FRAMED, ZLIB, LZMA, ZSTD);
 
     private int memoryLimitInKb = 100000;//100MB
 
@@ -141,6 +143,10 @@ public class CompressorParser extends AbstractParser {
             return SNAPPY_RAW;
         } else if (CompressorStreamFactory.LZMA.equals(name)) {
             return LZMA;
+        } else if (CompressorStreamFactory.ZSTANDARD.equals(name)) {
+            return ZSTD;
+        } else if (CompressorStreamFactory.DEFLATE64.equals(name)) {
+            return DEFLATE64;
         } else {
             return MediaType.OCTET_STREAM;
         }
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java 
b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index e76a7d5..bbb25e5 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -379,6 +379,8 @@ public class TestMimeTypes {
        // For spanned zip files, the .zip file doesn't have the header, it's 
the other parts
        assertTypeByData("application/octet-stream", 
"test-documents-spanned.zip");
        assertTypeByData("application/zip",          
"test-documents-spanned.z01");
+
+       assertTypeDetection("testZSTD.zstd", "application/zstd");
     }
     
     @Test
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
index 444afc7..26552eb 100644
--- 
a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
@@ -42,6 +42,7 @@ public class CompressorParserTest extends TikaTest {
         NOT_COVERED.add(MediaType.application("x-brotli"));
         NOT_COVERED.add(MediaType.application("x-lz4-block"));
         NOT_COVERED.add(MediaType.application("x-snappy-raw"));
+        NOT_COVERED.add(MediaType.application("deflate64"));
     }
 
     @Test
@@ -61,6 +62,12 @@ public class CompressorParserTest extends TikaTest {
     }
 
     @Test
+    public void testZstd() throws Exception {
+        XMLResult r = getXML("testZSTD.zstd");
+        assertContains("0123456789", r.xml);
+    }
+
+    @Test
     public void testCoverage() throws Exception {
         //test that the package parser covers all inputstreams handled
         //by CompressorStreamFactory.  When we update commons-compress, and 
they add
diff --git a/tika-parsers/src/test/resources/test-documents/testZSTD.zstd 
b/tika-parsers/src/test/resources/test-documents/testZSTD.zstd
new file mode 100644
index 0000000..f594f1a
Binary files /dev/null and 
b/tika-parsers/src/test/resources/test-documents/testZSTD.zstd differ

-- 
To stop receiving notification emails like this one, please contact
talli...@apache.org.

Reply via email to