Repository: tika
Updated Branches:
  refs/heads/master d6981ad81 -> 52ea9ba7c


Detection magic for POI-generated OOXML files, which have _rels/.rels rather
than [Content_Types].xml as the first zip entry, plus test


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/52ea9ba7
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/52ea9ba7
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/52ea9ba7

Branch: refs/heads/master
Commit: 52ea9ba7c2e3c99e7a2d4fb38875caa996438857
Parents: d6981ad
Author: Nick Burch <[email protected]>
Authored: Thu Jun 23 14:27:14 2016 +0100
Committer: Nick Burch <[email protected]>
Committed: Thu Jun 23 14:27:14 2016 +0100

----------------------------------------------------------------------
 .../org/apache/tika/mime/tika-mimetypes.xml         |   3 ++-
 .../java/org/apache/tika/mime/TestMimeTypes.java    |   5 +++++
 .../resources/test-documents/testEXCEL_poi.xlsx     | Bin 0 -> 3360 bytes
 3 files changed, 7 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/52ea9ba7/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index a94f188..b39f529 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3989,10 +3989,11 @@
   <!-- =================================================================== -->
   <mime-type type="application/x-tika-ooxml">
     <sub-class-of type="application/zip"/>
-    <!-- Only works if the Content Types file is the first zip entry -->
+    <!-- Only works if the Content Types or rels file is the first zip entry -->
     <magic priority="50">
       <match value="PK\003\004" type="string" offset="0">
         <match value="[Content_Types].xml" type="string" offset="30"/>
+        <match value="_rels/.rels" type="string" offset="30"/>
       </match>
     </magic>
   </mime-type>

http://git-wip-us.apache.org/repos/asf/tika/blob/52ea9ba7/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 102b005..81b154c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -283,12 +283,17 @@ public class TestMimeTypes {
         // As such, our mime magic can't figure it out...
         assertTypeByData("application/zip", "testWORD.docx");
         
+        // POI-generated files have the rels first not Content Types
+        assertTypeByData("application/x-tika-ooxml", "testEXCEL_poi.xlsx");
+        
         // If we give the filename as well as the data, we can
         //  specialise the ooxml generic one to the correct type
         assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "testEXCEL.xlsx");
         assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.presentationml.presentation", "testPPT.pptx");
         assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "testWORD.docx");
         
+        assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "testEXCEL_poi.xlsx");
+        
         // Test a few of the less usual ones
         assertTypeByNameAndData("application/vnd.ms-excel.sheet.binary.macroenabled.12","testEXCEL.xlsb");
         assertTypeByNameAndData("application/vnd.ms-powerpoint.presentation.macroenabled.12", "testPPT.pptm");

http://git-wip-us.apache.org/repos/asf/tika/blob/52ea9ba7/tika-parsers/src/test/resources/test-documents/testEXCEL_poi.xlsx
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_poi.xlsx b/tika-parsers/src/test/resources/test-documents/testEXCEL_poi.xlsx
new file mode 100644
index 0000000..713fb2e
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_poi.xlsx differ

Reply via email to