Author: jukka Date: Sun Jun 28 17:05:49 2009 New Revision: 789125 URL: http://svn.apache.org/viewvc?rev=789125&view=rev Log: TIKA-253: Better mime type for ooxml files
Updated office media type settings based on the table at http://blogs.msdn.com/vsofficedeveloper/pages/Office-2007-Open-XML-MIME-Types.aspx Modified: lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Modified: lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=789125&r1=789124&r2=789125&view=diff ============================================================================== --- lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original) +++ lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Sun Jun 28 17:05:49 2009 @@ -155,6 +155,10 @@ <root-XML namespaceURI="http://www.w3.org/1999/xhtml" localName="html" /> </mime-type> + <!-- ===================================================================== --> + <!-- Microsoft Office binary file formats --> + <!-- http://www.microsoft.com/interop/docs/OfficeBinaryFormats.mspx --> + <!-- ===================================================================== --> <mime-type type="application/x-tika-msoffice"> <magic> @@ -164,20 +168,49 @@ <!-- http://www.iana.org/assignments/media-types/application/vnd.visio --> <mime-type type="application/vnd.visio"> + <comment>Microsoft Visio Diagram</comment> <glob pattern="*.vsd" /> <glob pattern="*.vst" /> <glob pattern="*.vsw" /> <glob pattern="*.vss" /> + <sub-class-of type="application/x-tika-msoffice"/> </mime-type> + <!-- http://www.iana.org/assignments/media-types/application/vnd.ms-powerpoint --> <mime-type type="application/vnd.ms-powerpoint"> + <comment>Microsoft Powerpoint Presentation</comment> <glob pattern="*.ppz" /> <glob pattern="*.ppt" /> <glob pattern="*.pps" /> <glob pattern="*.pot" /> + <glob pattern="*.ppa" /> + <alias type="application/mspowerpoint" /> + <sub-class-of type="application/x-tika-msoffice"/> + </mime-type> + + <mime-type type="application/vnd.ms-powerpoint.addin.macroenabled.12"> + <glob pattern="*.ppam"/> + <sub-class-of type="application/x-tika-msoffice"/> + </mime-type> + + <mime-type type="application/vnd.ms-powerpoint.presentation.macroenabled.12"> + <glob pattern="*.pptm"/> + <sub-class-of type="application/x-tika-msoffice"/> + </mime-type> + + <mime-type type="application/vnd.ms-powerpoint.presentation.macroenabled.12"> + <glob pattern="*.potm"/> + <sub-class-of type="application/x-tika-msoffice"/> + </mime-type> + + <mime-type type="application/vnd.ms-powerpoint.slideshow.macroenabled.12"> + <glob pattern="*.ppsm"/> + <sub-class-of type="application/x-tika-msoffice"/> </mime-type> + <!-- http://www.iana.org/assignments/media-types/application/vnd.ms-excel --> <mime-type type="application/vnd.ms-excel"> + <comment>Microsoft Excel Spreadsheet</comment> <magic priority="50"> <match value="Microsoft\ Excel\ 5.0\ Worksheet" type="string" offset="2080" /> <match value="Foglio\ di\ lavoro\ Microsoft\ Exce" type="string" offset="2080" /> @@ -194,17 +227,116 @@ <glob pattern="*.xlt" /> <glob pattern="*.xld" /> <alias type="application/msexcel" /> + <sub-class-of type="application/x-tika-msoffice"/> + </mime-type> + + <mime-type type="application/vnd.ms-excel.sheet.macroenabled.12"> + <glob pattern="*.xlsm"/> + <sub-class-of type="application/x-tika-msoffice"/> + </mime-type> + + <mime-type type="application/vnd.ms-excel.template.macroenabled.12"> + <glob pattern="*.xltm"/> + <sub-class-of type="application/x-tika-msoffice"/> + </mime-type> + + <mime-type type="application/vnd.ms-excel.addin.macroenabled.12"> + <glob pattern="*.xlam"/> + <sub-class-of type="application/x-tika-msoffice"/> + </mime-type> + + <mime-type type="application/vnd.ms-excel.sheet.binary.macroenabled.12"> + <glob pattern="*.xlsb"/> + <sub-class-of type="application/x-tika-msoffice"/> + </mime-type> + + <!-- http://www.iana.org/assignments/media-types/application/msword --> + <mime-type type="application/msword"> + <comment>Microsoft Word Document</comment> + <magic priority="50"> + <match value="Microsoft\ Word\ 6.0\ Document" type="string" offset="2080" /> + <match value="Documento\ Microsoft\ Word\ 6" type="string" offset="2080" /> + <match value="MSWordDoc" type="string" offset="2112" /> + <match value="0x31be0000" type="big32" offset="0" /> + <match value="PO^Q`" type="string" offset="0" /> + <match value="\376\067\0\043" type="string" offset="0" /> + <match value="\333\245-\0\0\0" type="string" offset="0" /> + <match value="\354\245\301" type="string" offset="512" /> + <match value="\320\317\021\340\241\261\032\341" type="string" offset="0" /> + <match value="\224\246\056" type="string" offset="0" /> + <match value="R\0o\0o\0t\0\ \0E\0n\0t\0r\0y" type="string" offset="512" /> + </magic> + <glob pattern="*.doc" /> + <glob pattern="*.dot" /> + <alias type="application/vnd.ms-word" /> + <sub-class-of type="application/x-tika-msoffice"/> + </mime-type> + + <mime-type type="application/vnd.ms-word.document.macroenabled.12"> + <glob pattern="*.docm"/> + <sub-class-of type="application/x-tika-msoffice"/> + </mime-type> + + <mime-type type="application/vnd.ms-word.template.macroenabled.12"> + <glob pattern="*.dotm"/> + <sub-class-of type="application/x-tika-msoffice"/> </mime-type> <mime-type type="application/vnd.ms-outlook"> + <comment>Microsoft Outlook Message</comment> <glob pattern="*.msg" /> + <sub-class-of type="application/x-tika-msoffice"/> </mime-type> - <mime-type type="application/vnd.openxmlformats-package.core-properties+xml"> + <!-- ===================================================================== --> + <!-- Office Open XML file formats --> + <!-- http://www.ecma-international.org/publications/standards/Ecma-376.htm --> + <!-- ===================================================================== --> + + <mime-type type="application/x-tika-ooxml"> <sub-class-of type="application/zip"/> - <glob pattern="*.docx" /> - <glob pattern="*.pptx" /> - <glob pattern="*.xlsx" /> + </mime-type> + + <mime-type type="application/vnd.openxmlformats-officedocument.presentationml.presentation"> + <comment>Office Open XML Presentation</comment> + <glob pattern="*.pptx"/> + <sub-class-of type="application/x-tika-ooxml"/> + </mime-type> + + <mime-type type="application/vnd.openxmlformats-officedocument.presentationml.template"> + <comment>Office Open XML Presentation Template</comment> + <glob pattern="*.potx"/> + <sub-class-of type="application/x-tika-ooxml"/> + </mime-type> + + <mime-type type="application/vnd.openxmlformats-officedocument.presentationml.slideshow"> + <comment>Office Open XML Presentation Slideshow</comment> + <glob pattern="*.ppsx"/> + <sub-class-of type="application/x-tika-ooxml"/> + </mime-type> + + <mime-type type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"> + <comment>Office Open XML Spreadsheet</comment> + <glob pattern="*.xlsx"/> + <sub-class-of type="application/x-tika-ooxml"/> + </mime-type> + + <mime-type type="application/vnd.openxmlformats-officedocument.spreadsheetml.template"> + <comment>Office Open XML Spreadsheet Template</comment> + <glob pattern="*.xltx"/> + <sub-class-of type="application/x-tika-ooxml"/> + </mime-type> + + <mime-type type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"> + <comment>Office Open XML Document</comment> + <glob pattern="*.docx"/> + <sub-class-of type="application/x-tika-ooxml"/> + </mime-type> + + <mime-type type="application/vnd.openxmlformats-officedocument.wordprocessingml.template"> + <comment>Office Open XML Document Template</comment> + <glob pattern="*.dotx"/> + <sub-class-of type="application/x-tika-ooxml"/> </mime-type> <!-- ===================================================================== --> @@ -468,24 +600,6 @@ <glob pattern="*.class" /> </mime-type> - <mime-type type="application/msword"> - <magic priority="50"> - <match value="Microsoft\ Word\ 6.0\ Document" type="string" offset="2080" /> - <match value="Documento\ Microsoft\ Word\ 6" type="string" offset="2080" /> - <match value="MSWordDoc" type="string" offset="2112" /> - <match value="0x31be0000" type="big32" offset="0" /> - <match value="PO^Q`" type="string" offset="0" /> - <match value="\376\067\0\043" type="string" offset="0" /> - <match value="\333\245-\0\0\0" type="string" offset="0" /> - <match value="\354\245\301" type="string" offset="512" /> - <match value="\320\317\021\340\241\261\032\341" type="string" offset="0" /> - <match value="\224\246\056" type="string" offset="0" /> - <match value="R\0o\0o\0t\0\ \0E\0n\0t\0r\0y" type="string" offset="512" /> - </magic> - <glob pattern="*.doc" /> - <alias type="application/vnd.ms-word" /> - </mime-type> - <mime-type type="application/octet-stream"> <magic priority="50"> <match value="#\ This\ is\ a\ shell\ archive" type="string" offset="10" /> Modified: lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml?rev=789125&r1=789124&r2=789125&view=diff ============================================================================== --- lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml (original) +++ lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml Sun Jun 28 17:05:49 2009 @@ -30,18 +30,32 @@ <parser name="parse-office" class="org.apache.tika.parser.microsoft.OfficeParser"> <mime>application/x-tika-msoffice</mime> - <mime>application/msword</mime> - <mime>application/vnd.ms-excel</mime> - <mime>application/vnd.ms-powerpoint</mime> <mime>application/vnd.visio</mime> + <mime>application/vnd.ms-powerpoint</mime> + <mime>application/vnd.ms-powerpoint.addin.macroenabled.12</mime> + <mime>application/vnd.ms-powerpoint.presentation.macroenabled.12</mime> + <mime>application/vnd.ms-powerpoint.presentation.macroenabled.12</mime> + <mime>application/vnd.ms-powerpoint.slideshow.macroenabled.12</mime> + <mime>application/vnd.ms-excel</mime> + <mime>application/vnd.ms-excel.sheet.macroenabled.12</mime> + <mime>application/vnd.ms-excel.template.macroenabled.12</mime> + <mime>application/vnd.ms-excel.addin.macroenabled.12</mime> + <mime>application/vnd.ms-excel.sheet.binary.macroenabled.12</mime> + <mime>application/msword</mime> + <mime>application/vnd.ms-word.document.macroenabled.12</mime> + <mime>application/vnd.ms-word.template.macroenabled.12</mime> <mime>application/vnd.ms-outlook</mime> </parser> <parser name="parse-ooxml" class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"> - <mime>application/vnd.openxmlformats-package.core-properties+xml</mime> - <mime>application/vnd.openxmlformats-officedocument.spreadsheetml.sheet</mime> + <mime>application/x-tika-ooxml</mime> <mime>application/vnd.openxmlformats-officedocument.presentationml.presentation</mime> + <mime>application/vnd.openxmlformats-officedocument.presentationml.template</mime> + <mime>application/vnd.openxmlformats-officedocument.presentationml.slideshow</mime> + <mime>application/vnd.openxmlformats-officedocument.spreadsheetml.sheet</mime> + <mime>application/vnd.openxmlformats-officedocument.spreadsheetml.template</mime> <mime>application/vnd.openxmlformats-officedocument.wordprocessingml.document</mime> + <mime>application/vnd.openxmlformats-officedocument.wordprocessingml.template</mime> </parser> <parser name="parse-html" class="org.apache.tika.parser.html.HtmlParser"> Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=789125&r1=789124&r2=789125&view=diff ============================================================================== --- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java (original) +++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java Sun Jun 28 17:05:49 2009 @@ -37,8 +37,11 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { protected POIXMLTextExtractor extractor; - public AbstractOOXMLExtractor(POIXMLTextExtractor extractor) { + private final String type; + + public AbstractOOXMLExtractor(POIXMLTextExtractor extractor, String type) { this.extractor = extractor; + this.type = type; } /** @@ -52,7 +55,7 @@ * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getMetadataExtractor() */ public MetadataExtractor getMetadataExtractor() { - return new MetadataExtractor(extractor); + return new MetadataExtractor(extractor, type); } /** Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java?rev=789125&r1=789124&r2=789125&view=diff ============================================================================== --- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java (original) +++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java Sun Jun 28 17:05:49 2009 @@ -38,14 +38,18 @@ */ public class MetadataExtractor { - private POIXMLTextExtractor extractor; + private final POIXMLTextExtractor extractor; - public MetadataExtractor(POIXMLTextExtractor extractor) { + private final String type; + + public MetadataExtractor(POIXMLTextExtractor extractor, String type) { this.extractor = extractor; + this.type = type; } public void extract(Metadata metadata) throws TikaException { try { + addProperty(metadata, Metadata.CONTENT_TYPE, type); extractMetadata(extractor.getCoreProperties(), metadata); extractMetadata(extractor.getExtendedProperties(), metadata); } catch (IOException e) { @@ -64,8 +68,6 @@ addProperty(metadata, Metadata.CATEGORY, propsHolder.getCategoryProperty()); addProperty(metadata, Metadata.CONTENT_STATUS, propsHolder .getContentStatusProperty()); - addProperty(metadata, Metadata.CONTENT_TYPE, propsHolder - .getContentType()); addProperty(metadata, Metadata.DATE, propsHolder .getCreatedPropertyString()); addProperty(metadata, Metadata.CREATOR, propsHolder Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java?rev=789125&r1=789124&r2=789125&view=diff ============================================================================== --- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java (original) +++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java Sun Jun 28 17:05:49 2009 @@ -23,7 +23,7 @@ public class POIXMLTextExtractorDecorator extends AbstractOOXMLExtractor { public POIXMLTextExtractorDecorator(POIXMLTextExtractor extractor) { - super(extractor); + super(extractor, null); } @Override Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=789125&r1=789124&r2=789125&view=diff ============================================================================== --- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java (original) +++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java Sun Jun 28 17:05:49 2009 @@ -39,7 +39,7 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { public XSLFPowerPointExtractorDecorator(XSLFPowerPointExtractor extractor) { - super(extractor); + super(extractor, "application/vnd.openxmlformats-officedocument.presentationml.presentation"); } /** Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=789125&r1=789124&r2=789125&view=diff ============================================================================== --- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java (original) +++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java Sun Jun 28 17:05:49 2009 @@ -35,7 +35,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { public XSSFExcelExtractorDecorator(XSSFExcelExtractor extractor) { - super(extractor); + super(extractor, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); } /** Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=789125&r1=789124&r2=789125&view=diff ============================================================================== --- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (original) +++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java Sun Jun 28 17:05:49 2009 @@ -39,7 +39,7 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { public XWPFWordExtractorDecorator(XWPFWordExtractor extractor) { - super(extractor); + super(extractor, "application/vnd.openxmlformats-officedocument.wordprocessingml.document"); } /** Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=789125&r1=789124&r2=789125&view=diff ============================================================================== --- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original) +++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Sun Jun 28 17:05:49 2009 @@ -84,12 +84,38 @@ assertTypeByName("text/html", "x.html"); assertTypeByName("application/xhtml+xml", "x.xhtml"); assertTypeByName("application/xml", "x.xml"); - assertTypeByName("application/msword", "x.doc"); - assertTypeByName("application/vnd.ms-powerpoint", "x.ppt"); - assertTypeByName("application/vnd.ms-excel", "x.xls"); assertTypeByName("application/zip", "x.zip"); assertTypeByName("application/vnd.oasis.opendocument.text", "x.odt"); assertTypeByName("application/octet-stream", "x.xyz"); + + // Test for the MS Office media types and file extensions listed in + // http://blogs.msdn.com/vsofficedeveloper/pages/Office-2007-Open-XML-MIME-Types.aspx + assertTypeByName("application/msword", "x.doc"); + assertTypeByName("application/msword", "x.dot"); + assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "x.docx"); + assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.template", "x.dotx"); + assertTypeByName("application/vnd.ms-word.document.macroenabled.12", "x.docm"); + assertTypeByName("application/vnd.ms-word.template.macroenabled.12", "x.dotm"); + assertTypeByName("application/vnd.ms-excel", "x.xls"); + assertTypeByName("application/vnd.ms-excel", "x.xlt"); + assertTypeByName("application/vnd.ms-excel", "x.xla"); + assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "x.xlsx"); + assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.template", "x.xltx"); + assertTypeByName("application/vnd.ms-excel.sheet.macroenabled.12", "x.xlsm"); + assertTypeByName("application/vnd.ms-excel.template.macroenabled.12", "x.xltm"); + assertTypeByName("application/vnd.ms-excel.addin.macroenabled.12", "x.xlam"); + assertTypeByName("application/vnd.ms-excel.sheet.binary.macroenabled.12", "x.xlsb"); + assertTypeByName("application/vnd.ms-powerpoint", "x.ppt"); + assertTypeByName("application/vnd.ms-powerpoint", "x.pot"); + assertTypeByName("application/vnd.ms-powerpoint", "x.pps"); + assertTypeByName("application/vnd.ms-powerpoint", "x.ppa"); + assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.presentation", "x.pptx"); + assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.template", "x.potx"); + assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.slideshow", "x.ppsx"); + assertTypeByName("application/vnd.ms-powerpoint.addin.macroenabled.12", "x.ppam"); + assertTypeByName("application/vnd.ms-powerpoint.presentation.macroenabled.12", "x.pptm"); + assertTypeByName("application/vnd.ms-powerpoint.presentation.macroenabled.12", "x.potm"); + assertTypeByName("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "x.ppsm"); } public void testJpegDetection() throws Exception { Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=789125&r1=789124&r2=789125&view=diff ============================================================================== --- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original) +++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Sun Jun 28 17:05:49 2009 @@ -44,7 +44,7 @@ parser.parse(input, handler, metadata); assertEquals( - "application/vnd.openxmlformats-package.core-properties+xml", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("Simple Excel document", metadata.get(Metadata.TITLE)); assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR)); @@ -74,7 +74,7 @@ parser.parse(input, handler, metadata); assertEquals( - "application/vnd.openxmlformats-package.core-properties+xml", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("Sample Powerpoint Slide", metadata.get(Metadata.TITLE)); assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR)); @@ -101,7 +101,7 @@ parser.parse(input, handler, metadata); assertEquals( - "application/vnd.openxmlformats-package.core-properties+xml", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("Sample Word Document", metadata.get(Metadata.TITLE)); assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
