TIKA-1890 Mime magic for CAB files, and unit tests for detection
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/f7d3097f
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/f7d3097f
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/f7d3097f

Branch: refs/heads/master
Commit: f7d3097fb6581d989195b51bb2bc4302ad9bf24a
Parents: b878281
Author: Nick Burch <[email protected]>
Authored: Sun Mar 6 14:33:54 2016 +0000
Committer: Nick Burch <[email protected]>
Committed: Sun Mar 6 14:33:54 2016 +0000

----------------------------------------------------------------------
 .../src/main/resources/org/apache/tika/mime/tika-mimetypes.xml | 3 +++
 .../src/test/java/org/apache/tika/mime/TestMimeTypes.java | 2 ++
 2 files changed, 5 insertions(+)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/tika/blob/f7d3097f/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index bcf4fee..a4e0588 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -1517,6 +1517,9 @@
   </mime-type>
   <mime-type type="application/vnd.ms-asf"/>
   <mime-type type="application/vnd.ms-cab-compressed">
+    <magic priority="50">
+      <match value="MSCF\000\000\000\000" type="string" offset="0"/>
+    </magic>
     <glob pattern="*.cab"/>
   </mime-type>

http://git-wip-us.apache.org/repos/asf/tika/blob/f7d3097f/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 67a749e..57198ad 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -353,6 +353,7 @@ public class TestMimeTypes {
         assertTypeByName("application/x-tar", "test.tar");
         assertTypeByName("application/gzip", "test.tgz"); // See GZIP, not tar contents of it
         assertTypeByName("application/x-cpio", "test.cpio");
+        assertTypeByName("application/vnd.ms-cab-compressed", "test.cab");

         // TODO Add an example .deb and .udeb, then check these
@@ -363,6 +364,7 @@ public class TestMimeTypes {
         assertTypeByData("application/x-gtar", "test-documents.tar"); // GNU TAR
         assertTypeByData("application/gzip", "test-documents.tgz"); // See GZIP, not tar contents of it
         assertTypeByData("application/x-cpio", "test-documents.cpio");
+        assertTypeByData("application/vnd.ms-cab-compressed", "test-documents.cab");

         // For spanned zip files, the .zip file doesn't have the header, it's the other parts
         assertTypeByData("application/octet-stream", "test-documents-spanned.zip");
