Author: nick
Date: Wed Mar 11 16:49:18 2015
New Revision: 1665940
URL: http://svn.apache.org/r1665940
Log:
TIKA-1286 Bring the overall file MIME types into line with the other OOXML
formats, and add container-aware detection + tests for the Visio OOXML types
Modified:
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Modified:
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1665940&r1=1665939&r2=1665940&view=diff
==============================================================================
---
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
(original)
+++
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
Wed Mar 11 16:49:18 2015
@@ -2389,32 +2389,32 @@
<sub-class-of type="application/x-tika-msoffice"/>
</mime-type>
- <mime-type type="application/vnd.ms-visio.drawing.main+xml">
+ <mime-type type="application/vnd.ms-visio.drawing">
<_comment>Office Open XML Visio Drawing (macro-free)</_comment>
<glob pattern="*.vsdx"/>
<sub-class-of type="application/x-tika-visio-ooxml"/>
</mime-type>
- <mime-type type="application/vnd.ms-visio.template.main+xml">
+ <mime-type type="application/vnd.ms-visio.template">
<_comment>Office Open XML Visio Template (macro-free)</_comment>
<glob pattern="*.vstx"/>
<sub-class-of type="application/x-tika-visio-ooxml"/>
</mime-type>
- <mime-type type="application/vnd.ms-visio.stencil.main+xml">
+ <mime-type type="application/vnd.ms-visio.stencil">
<_comment>Office Open XML Visio Stencil (macro-free)</_comment>
<glob pattern="*.vssx"/>
<sub-class-of type="application/x-tika-visio-ooxml"/>
</mime-type>
- <mime-type type="application/vnd.ms-visio.drawing.macroEnabled.main+xml">
+ <mime-type type="application/vnd.ms-visio.drawing.macroEnabled.12">
<_comment>Office Open XML Visio Drawing (macro-enabled)</_comment>
<glob pattern="*.vsdm"/>
<sub-class-of type="application/x-tika-visio-ooxml"/>
</mime-type>
- <mime-type type="application/vnd.ms-visio.template.macroEnabled.main+xml">
+ <mime-type type="application/vnd.ms-visio.template.macroEnabled.12">
<_comment>Office Open XML Visio Template (macro-enabled)</_comment>
<glob pattern="*.vstm"/>
<sub-class-of type="application/x-tika-visio-ooxml"/>
</mime-type>
- <mime-type type="application/vnd.ms-visio.stencil.macroEnabled.main+xml">
+ <mime-type type="application/vnd.ms-visio.stencil.macroEnabled.12">
<_comment>Office Open XML Visio Stencil (macro-enabled)</_comment>
<glob pattern="*.vssm"/>
<sub-class-of type="application/x-tika-visio-ooxml"/>
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java?rev=1665940&r1=1665939&r2=1665940&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
Wed Mar 11 16:49:18 2015
@@ -58,6 +58,10 @@ import org.apache.tika.parser.iwork.IWor
public class ZipContainerDetector implements Detector {
private static final Pattern MACRO_TEMPLATE_PATTERN =
Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE);
+ // TODO Remove this constant once we upgrade to POI 3.12 beta 2, it is defined in ExtractorFactory there
+ private static final String VISIO_DOCUMENT_REL =
+ "http://schemas.microsoft.com/visio/2010/relationships/document";
+
/** Serial version UID */
private static final long serialVersionUID = 2891763938430295453L;
@@ -231,8 +235,15 @@ public class ZipContainerDetector implem
* opened Package
*/
public static MediaType detectOfficeOpenXML(OPCPackage pkg) {
+ // Check for the normal Office core document
PackageRelationshipCollection core =
pkg.getRelationshipsByType(ExtractorFactory.CORE_DOCUMENT_REL);
+ // Otherwise check for some other Office core document types
+ if (core.size() == 0) {
+ core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
+ }
+
+ // If we didn't find a single core document of any type, skip detection
if (core.size() != 1) {
// Invalid OOXML Package received
return null;
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1665940&r1=1665939&r2=1665940&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
Wed Mar 11 16:49:18 2015
@@ -215,15 +215,12 @@ public class TestContainerAwareDetector
assertTypeByData("testDOTM.dotm",
"application/vnd.ms-word.template.macroEnabled.12");
assertTypeByData("testPPT.xps", "application/vnd.ms-xpsdocument");
- // TODO Support detecting the Visio OOXML files
-/*
- assertTypeByData("testVISIO.vsdm",
"application/vnd.ms-visio.drawing.macroenabled.main+xml");
- assertTypeByData("testVISIO.vsdx",
"application/vnd.ms-visio.drawing.main+xml");
- assertTypeByData("testVISIO.vssm",
"application/vnd.ms-visio.stencil.macroenabled.main+xml");
- assertTypeByData("testVISIO.vssx",
"application/vnd.ms-visio.stencil.main+xml");
- assertTypeByData("testVISIO.vstm",
"application/vnd.ms-visio.template.macroenabled.main+xml");
- assertTypeByData("testVISIO.vstx",
"application/vnd.ms-visio.template.main+xml");
-*/
+ assertTypeByData("testVISIO.vsdm",
"application/vnd.ms-visio.drawing.macroenabled.12");
+ assertTypeByData("testVISIO.vsdx", "application/vnd.ms-visio.drawing");
+ assertTypeByData("testVISIO.vssm",
"application/vnd.ms-visio.stencil.macroenabled.12");
+ assertTypeByData("testVISIO.vssx", "application/vnd.ms-visio.stencil");
+ assertTypeByData("testVISIO.vstm",
"application/vnd.ms-visio.template.macroenabled.12");
+ assertTypeByData("testVISIO.vstx",
"application/vnd.ms-visio.template");
// .xlsb is an OOXML file containing the binary parts, and not
// an OLE2 file as you might initially expect!
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1665940&r1=1665939&r2=1665940&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Wed Mar 11 16:49:18 2015
@@ -291,21 +291,21 @@ public class TestMimeTypes {
public void testVisioDetection() throws Exception {
// By Name, should get it right
assertTypeByName("application/vnd.visio", "testVISIO.vsd");
-
assertTypeByName("application/vnd.ms-visio.drawing.macroenabled.main+xml",
"testVISIO.vsdm");
- assertTypeByName("application/vnd.ms-visio.drawing.main+xml",
"testVISIO.vsdx");
-
assertTypeByName("application/vnd.ms-visio.stencil.macroenabled.main+xml",
"testVISIO.vssm");
- assertTypeByName("application/vnd.ms-visio.stencil.main+xml",
"testVISIO.vssx");
-
assertTypeByName("application/vnd.ms-visio.template.macroenabled.main+xml",
"testVISIO.vstm");
- assertTypeByName("application/vnd.ms-visio.template.main+xml",
"testVISIO.vstx");
+ assertTypeByName("application/vnd.ms-visio.drawing.macroenabled.12",
"testVISIO.vsdm");
+ assertTypeByName("application/vnd.ms-visio.drawing", "testVISIO.vsdx");
+ assertTypeByName("application/vnd.ms-visio.stencil.macroenabled.12",
"testVISIO.vssm");
+ assertTypeByName("application/vnd.ms-visio.stencil", "testVISIO.vssx");
+ assertTypeByName("application/vnd.ms-visio.template.macroenabled.12",
"testVISIO.vstm");
+ assertTypeByName("application/vnd.ms-visio.template",
"testVISIO.vstx");
// By Name and Data, should get it right
assertTypeByNameAndData("application/vnd.visio", "testVISIO.vsd");
-
assertTypeByNameAndData("application/vnd.ms-visio.drawing.macroenabled.main+xml",
"testVISIO.vsdm");
- assertTypeByNameAndData("application/vnd.ms-visio.drawing.main+xml",
"testVISIO.vsdx");
-
assertTypeByNameAndData("application/vnd.ms-visio.stencil.macroenabled.main+xml",
"testVISIO.vssm");
- assertTypeByNameAndData("application/vnd.ms-visio.stencil.main+xml",
"testVISIO.vssx");
-
assertTypeByNameAndData("application/vnd.ms-visio.template.macroenabled.main+xml",
"testVISIO.vstm");
- assertTypeByNameAndData("application/vnd.ms-visio.template.main+xml",
"testVISIO.vstx");
+
assertTypeByNameAndData("application/vnd.ms-visio.drawing.macroenabled.12",
"testVISIO.vsdm");
+ assertTypeByNameAndData("application/vnd.ms-visio.drawing",
"testVISIO.vsdx");
+
assertTypeByNameAndData("application/vnd.ms-visio.stencil.macroenabled.12",
"testVISIO.vssm");
+ assertTypeByNameAndData("application/vnd.ms-visio.stencil",
"testVISIO.vssx");
+
assertTypeByNameAndData("application/vnd.ms-visio.template.macroenabled.12",
"testVISIO.vstm");
+ assertTypeByNameAndData("application/vnd.ms-visio.template",
"testVISIO.vstx");
// By Data only, will get the container parent
assertTypeByData("application/x-tika-msoffice", "testVISIO.vsd");