Repository: tika Updated Branches: refs/heads/master d6981ad81 -> 52ea9ba7c
Detection magic for POI-generated OOXML files, which have _rels before content type, plus test Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/52ea9ba7 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/52ea9ba7 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/52ea9ba7 Branch: refs/heads/master Commit: 52ea9ba7c2e3c99e7a2d4fb38875caa996438857 Parents: d6981ad Author: Nick Burch <[email protected]> Authored: Thu Jun 23 14:27:14 2016 +0100 Committer: Nick Burch <[email protected]> Committed: Thu Jun 23 14:27:14 2016 +0100 ---------------------------------------------------------------------- .../org/apache/tika/mime/tika-mimetypes.xml | 3 ++- .../java/org/apache/tika/mime/TestMimeTypes.java | 5 +++++ .../resources/test-documents/testEXCEL_poi.xlsx | Bin 0 -> 3360 bytes 3 files changed, 7 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/52ea9ba7/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml ---------------------------------------------------------------------- diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index a94f188..b39f529 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -3989,10 +3989,11 @@ <!-- =================================================================== --> <mime-type type="application/x-tika-ooxml"> <sub-class-of type="application/zip"/> - <!-- Only works if the Content Types file is the first zip entry --> + <!-- Only works if the Content Types or rels file is the first zip entry --> <magic priority="50"> <match value="PK\003\004" type="string" offset="0"> <match value="[Content_Types].xml" type="string" offset="30"/> + <match value="_rels/.rels" type="string" offset="30"/> </match> </magic> </mime-type> http://git-wip-us.apache.org/repos/asf/tika/blob/52ea9ba7/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java index 102b005..81b154c 100644 --- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java +++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java @@ -283,12 +283,17 @@ public class TestMimeTypes { // As such, our mime magic can't figure it out... assertTypeByData("application/zip", "testWORD.docx"); + // POI-generated files have the rels first not Content Types + assertTypeByData("application/x-tika-ooxml", "testEXCEL_poi.xlsx"); + // If we give the filename as well as the data, we can // specialise the ooxml generic one to the correct type assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "testEXCEL.xlsx"); assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.presentationml.presentation", "testPPT.pptx"); assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "testWORD.docx"); + assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "testEXCEL_poi.xlsx"); + // Test a few of the less usual ones assertTypeByNameAndData("application/vnd.ms-excel.sheet.binary.macroenabled.12","testEXCEL.xlsb"); assertTypeByNameAndData("application/vnd.ms-powerpoint.presentation.macroenabled.12", "testPPT.pptm"); http://git-wip-us.apache.org/repos/asf/tika/blob/52ea9ba7/tika-parsers/src/test/resources/test-documents/testEXCEL_poi.xlsx ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_poi.xlsx b/tika-parsers/src/test/resources/test-documents/testEXCEL_poi.xlsx new file mode 100644 index 0000000..713fb2e Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_poi.xlsx differ
