Author: jukka
Date: Sun Jun 28 17:05:49 2009
New Revision: 789125

URL: http://svn.apache.org/viewvc?rev=789125&view=rev
Log:
TIKA-253: Better mime type for ooxml files

Updated office media type settings based on the table at 
http://blogs.msdn.com/vsofficedeveloper/pages/Office-2007-Open-XML-MIME-Types.aspx

Modified:
    
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
    
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
    
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

Modified: 
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=789125&r1=789124&r2=789125&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
 (original)
+++ 
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
 Sun Jun 28 17:05:49 2009
@@ -155,6 +155,10 @@
     <root-XML namespaceURI="http://www.w3.org/1999/xhtml" localName="html" />
   </mime-type>
 
+  <!-- ===================================================================== 
-->
+  <!-- Microsoft Office binary file formats                                  
-->
+  <!-- http://www.microsoft.com/interop/docs/OfficeBinaryFormats.mspx        
-->
+  <!-- ===================================================================== 
-->
 
   <mime-type type="application/x-tika-msoffice">
     <magic>
@@ -164,20 +168,49 @@
 
   <!-- http://www.iana.org/assignments/media-types/application/vnd.visio -->
   <mime-type type="application/vnd.visio">
+    <comment>Microsoft Visio Diagram</comment>
     <glob pattern="*.vsd" />
     <glob pattern="*.vst" />
     <glob pattern="*.vsw" />
     <glob pattern="*.vss" />
+    <sub-class-of type="application/x-tika-msoffice"/>
   </mime-type>
 
+  <!-- 
http://www.iana.org/assignments/media-types/application/vnd.ms-powerpoint -->
   <mime-type type="application/vnd.ms-powerpoint">
+    <comment>Microsoft Powerpoint Presentation</comment>
     <glob pattern="*.ppz" />
     <glob pattern="*.ppt" />
     <glob pattern="*.pps" />
     <glob pattern="*.pot" />
+    <glob pattern="*.ppa" />
+    <alias type="application/mspowerpoint" />
+    <sub-class-of type="application/x-tika-msoffice"/>
+  </mime-type>
+
+  <mime-type type="application/vnd.ms-powerpoint.addin.macroenabled.12">
+    <glob pattern="*.ppam"/>
+    <sub-class-of type="application/x-tika-msoffice"/>
+  </mime-type>
+
+  <mime-type type="application/vnd.ms-powerpoint.presentation.macroenabled.12">
+    <glob pattern="*.pptm"/>
+    <sub-class-of type="application/x-tika-msoffice"/>
+  </mime-type>
+
+  <mime-type type="application/vnd.ms-powerpoint.presentation.macroenabled.12">
+    <glob pattern="*.potm"/>
+    <sub-class-of type="application/x-tika-msoffice"/>
+  </mime-type>
+
+  <mime-type type="application/vnd.ms-powerpoint.slideshow.macroenabled.12">
+    <glob pattern="*.ppsm"/>
+    <sub-class-of type="application/x-tika-msoffice"/>
   </mime-type>
 
+  <!-- http://www.iana.org/assignments/media-types/application/vnd.ms-excel -->
   <mime-type type="application/vnd.ms-excel">
+    <comment>Microsoft Excel Spreadsheet</comment>
     <magic priority="50">
       <match value="Microsoft\ Excel\ 5.0\ Worksheet" type="string" 
offset="2080" />
       <match value="Foglio\ di\ lavoro\ Microsoft\ Exce" type="string" 
offset="2080" />
@@ -194,17 +227,116 @@
     <glob pattern="*.xlt" />
     <glob pattern="*.xld" />
     <alias type="application/msexcel" />
+    <sub-class-of type="application/x-tika-msoffice"/>
+  </mime-type>
+
+  <mime-type type="application/vnd.ms-excel.sheet.macroenabled.12">
+    <glob pattern="*.xlsm"/>
+    <sub-class-of type="application/x-tika-msoffice"/>
+  </mime-type>
+
+  <mime-type type="application/vnd.ms-excel.template.macroenabled.12">
+    <glob pattern="*.xltm"/>
+    <sub-class-of type="application/x-tika-msoffice"/>
+  </mime-type>
+
+  <mime-type type="application/vnd.ms-excel.addin.macroenabled.12">
+    <glob pattern="*.xlam"/>
+    <sub-class-of type="application/x-tika-msoffice"/>
+  </mime-type>
+
+  <mime-type type="application/vnd.ms-excel.sheet.binary.macroenabled.12">
+    <glob pattern="*.xlsb"/>
+    <sub-class-of type="application/x-tika-msoffice"/>
+  </mime-type>
+
+  <!-- http://www.iana.org/assignments/media-types/application/msword -->
+  <mime-type type="application/msword">
+    <comment>Microsoft Word Document</comment>
+    <magic priority="50">
+      <match value="Microsoft\ Word\ 6.0\ Document" type="string" 
offset="2080" />
+      <match value="Documento\ Microsoft\ Word\ 6" type="string" offset="2080" 
/>
+      <match value="MSWordDoc" type="string" offset="2112" />
+      <match value="0x31be0000" type="big32" offset="0" />
+      <match value="PO^Q`" type="string" offset="0" />
+      <match value="\376\067\0\043" type="string" offset="0" />
+      <match value="\333\245-\0\0\0" type="string" offset="0" />
+      <match value="\354\245\301" type="string" offset="512" />
+      <match value="\320\317\021\340\241\261\032\341" type="string" offset="0" 
/>
+      <match value="\224\246\056" type="string" offset="0" />
+      <match value="R\0o\0o\0t\0\ \0E\0n\0t\0r\0y" type="string" offset="512" 
/>
+    </magic>
+    <glob pattern="*.doc" />
+    <glob pattern="*.dot" />
+    <alias type="application/vnd.ms-word" />
+    <sub-class-of type="application/x-tika-msoffice"/>
+  </mime-type>
+
+  <mime-type type="application/vnd.ms-word.document.macroenabled.12">
+    <glob pattern="*.docm"/>
+    <sub-class-of type="application/x-tika-msoffice"/>
+  </mime-type>
+
+  <mime-type type="application/vnd.ms-word.template.macroenabled.12">
+    <glob pattern="*.dotm"/>
+    <sub-class-of type="application/x-tika-msoffice"/>
   </mime-type>
 
   <mime-type type="application/vnd.ms-outlook">
+    <comment>Microsoft Outlook Message</comment>
     <glob pattern="*.msg" />
+    <sub-class-of type="application/x-tika-msoffice"/>
   </mime-type>
 
-  <mime-type type="application/vnd.openxmlformats-package.core-properties+xml">
+  <!-- ===================================================================== 
-->
+  <!-- Office Open XML file formats                                          
-->
+  <!-- http://www.ecma-international.org/publications/standards/Ecma-376.htm 
-->
+  <!-- ===================================================================== 
-->
+
+  <mime-type type="application/x-tika-ooxml">
     <sub-class-of type="application/zip"/>
-    <glob pattern="*.docx" />
-    <glob pattern="*.pptx" />
-    <glob pattern="*.xlsx" />
+  </mime-type>
+
+  <mime-type 
type="application/vnd.openxmlformats-officedocument.presentationml.presentation">
+    <comment>Office Open XML Presentation</comment>
+    <glob pattern="*.pptx"/>
+    <sub-class-of type="application/x-tika-ooxml"/>
+  </mime-type>
+
+  <mime-type 
type="application/vnd.openxmlformats-officedocument.presentationml.template">
+    <comment>Office Open XML Presentation Template</comment>
+    <glob pattern="*.potx"/>
+    <sub-class-of type="application/x-tika-ooxml"/>
+  </mime-type>
+
+  <mime-type 
type="application/vnd.openxmlformats-officedocument.presentationml.slideshow">
+    <comment>Office Open XML Presentation Slideshow</comment>
+    <glob pattern="*.ppsx"/>
+    <sub-class-of type="application/x-tika-ooxml"/>
+  </mime-type>
+
+  <mime-type 
type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet">
+    <comment>Office Open XML Spreadsheet</comment>
+    <glob pattern="*.xlsx"/>
+    <sub-class-of type="application/x-tika-ooxml"/>
+  </mime-type>
+
+  <mime-type 
type="application/vnd.openxmlformats-officedocument.spreadsheetml.template">
+    <comment>Office Open XML Spreadsheet Template</comment>
+    <glob pattern="*.xltx"/>
+    <sub-class-of type="application/x-tika-ooxml"/>
+  </mime-type>
+
+  <mime-type 
type="application/vnd.openxmlformats-officedocument.wordprocessingml.document">
+    <comment>Office Open XML Document</comment>
+    <glob pattern="*.docx"/>
+    <sub-class-of type="application/x-tika-ooxml"/>
+  </mime-type>
+
+  <mime-type 
type="application/vnd.openxmlformats-officedocument.wordprocessingml.template">
+    <comment>Office Open XML Document Template</comment>
+    <glob pattern="*.dotx"/>
+    <sub-class-of type="application/x-tika-ooxml"/>
   </mime-type>
 
   <!-- ===================================================================== 
-->
@@ -468,24 +600,6 @@
     <glob pattern="*.class" />
   </mime-type>
 
-  <mime-type type="application/msword">
-    <magic priority="50">
-      <match value="Microsoft\ Word\ 6.0\ Document" type="string" 
offset="2080" />
-      <match value="Documento\ Microsoft\ Word\ 6" type="string" offset="2080" 
/>
-      <match value="MSWordDoc" type="string" offset="2112" />
-      <match value="0x31be0000" type="big32" offset="0" />
-      <match value="PO^Q`" type="string" offset="0" />
-      <match value="\376\067\0\043" type="string" offset="0" />
-      <match value="\333\245-\0\0\0" type="string" offset="0" />
-      <match value="\354\245\301" type="string" offset="512" />
-      <match value="\320\317\021\340\241\261\032\341" type="string" offset="0" 
/>
-      <match value="\224\246\056" type="string" offset="0" />
-      <match value="R\0o\0o\0t\0\ \0E\0n\0t\0r\0y" type="string" offset="512" 
/>
-    </magic>
-    <glob pattern="*.doc" />
-    <alias type="application/vnd.ms-word" />
-  </mime-type>
-
   <mime-type type="application/octet-stream">
     <magic priority="50">
       <match value="#\ This\ is\ a\ shell\ archive" type="string" offset="10" 
/>

Modified: 
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml?rev=789125&r1=789124&r2=789125&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml 
(original)
+++ 
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml 
Sun Jun 28 17:05:49 2009
@@ -30,18 +30,32 @@
 
         <parser name="parse-office" 
class="org.apache.tika.parser.microsoft.OfficeParser">
                 <mime>application/x-tika-msoffice</mime>
-                <mime>application/msword</mime>
-                <mime>application/vnd.ms-excel</mime>
-                <mime>application/vnd.ms-powerpoint</mime>
                 <mime>application/vnd.visio</mime>
+                <mime>application/vnd.ms-powerpoint</mime>
+                
<mime>application/vnd.ms-powerpoint.addin.macroenabled.12</mime>
+                
<mime>application/vnd.ms-powerpoint.presentation.macroenabled.12</mime>
+                
<mime>application/vnd.ms-powerpoint.presentation.macroenabled.12</mime>
+                
<mime>application/vnd.ms-powerpoint.slideshow.macroenabled.12</mime>
+                <mime>application/vnd.ms-excel</mime>
+                <mime>application/vnd.ms-excel.sheet.macroenabled.12</mime>
+                <mime>application/vnd.ms-excel.template.macroenabled.12</mime>
+                <mime>application/vnd.ms-excel.addin.macroenabled.12</mime>
+                
<mime>application/vnd.ms-excel.sheet.binary.macroenabled.12</mime>
+                <mime>application/msword</mime>
+                <mime>application/vnd.ms-word.document.macroenabled.12</mime>
+                <mime>application/vnd.ms-word.template.macroenabled.12</mime>
                 <mime>application/vnd.ms-outlook</mime>
         </parser>
         
         <parser name="parse-ooxml" 
class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
-                
<mime>application/vnd.openxmlformats-package.core-properties+xml</mime>
-                
<mime>application/vnd.openxmlformats-officedocument.spreadsheetml.sheet</mime>
+                <mime>application/x-tika-ooxml</mime>
                 
<mime>application/vnd.openxmlformats-officedocument.presentationml.presentation</mime>
+                
<mime>application/vnd.openxmlformats-officedocument.presentationml.template</mime>
+                
<mime>application/vnd.openxmlformats-officedocument.presentationml.slideshow</mime>
+                
<mime>application/vnd.openxmlformats-officedocument.spreadsheetml.sheet</mime>
+                
<mime>application/vnd.openxmlformats-officedocument.spreadsheetml.template</mime>
                 
<mime>application/vnd.openxmlformats-officedocument.wordprocessingml.document</mime>
+                
<mime>application/vnd.openxmlformats-officedocument.wordprocessingml.template</mime>
         </parser>
 
         <parser name="parse-html" 
class="org.apache.tika.parser.html.HtmlParser">

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=789125&r1=789124&r2=789125&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
 Sun Jun 28 17:05:49 2009
@@ -37,8 +37,11 @@
 public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
     protected POIXMLTextExtractor extractor;
 
-    public AbstractOOXMLExtractor(POIXMLTextExtractor extractor) {
+    private final String type;
+
+    public AbstractOOXMLExtractor(POIXMLTextExtractor extractor, String type) {
         this.extractor = extractor;
+        this.type = type;
     }
 
     /**
@@ -52,7 +55,7 @@
      * @see 
org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getMetadataExtractor()
      */
     public MetadataExtractor getMetadataExtractor() {
-        return new MetadataExtractor(extractor);
+        return new MetadataExtractor(extractor, type);
     }
 
     /**

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java?rev=789125&r1=789124&r2=789125&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
 Sun Jun 28 17:05:49 2009
@@ -38,14 +38,18 @@
  */
 public class MetadataExtractor {
 
-    private POIXMLTextExtractor extractor;
+    private final POIXMLTextExtractor extractor;
 
-    public MetadataExtractor(POIXMLTextExtractor extractor) {
+    private final String type;
+
+    public MetadataExtractor(POIXMLTextExtractor extractor, String type) {
         this.extractor = extractor;
+        this.type = type;
     }
 
     public void extract(Metadata metadata) throws TikaException {
         try {
+            addProperty(metadata, Metadata.CONTENT_TYPE, type);
             extractMetadata(extractor.getCoreProperties(), metadata);
             extractMetadata(extractor.getExtendedProperties(), metadata);
         } catch (IOException e) {
@@ -64,8 +68,6 @@
         addProperty(metadata, Metadata.CATEGORY, 
propsHolder.getCategoryProperty());
         addProperty(metadata, Metadata.CONTENT_STATUS, propsHolder
                 .getContentStatusProperty());
-        addProperty(metadata, Metadata.CONTENT_TYPE, propsHolder
-                .getContentType());
         addProperty(metadata, Metadata.DATE, propsHolder
                 .getCreatedPropertyString());
         addProperty(metadata, Metadata.CREATOR, propsHolder

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java?rev=789125&r1=789124&r2=789125&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
 Sun Jun 28 17:05:49 2009
@@ -23,7 +23,7 @@
 public class POIXMLTextExtractorDecorator extends AbstractOOXMLExtractor {
 
     public POIXMLTextExtractorDecorator(POIXMLTextExtractor extractor) {
-        super(extractor);
+        super(extractor, null);
     }
 
     @Override

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=789125&r1=789124&r2=789125&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
 Sun Jun 28 17:05:49 2009
@@ -39,7 +39,7 @@
 public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
 
     public XSLFPowerPointExtractorDecorator(XSLFPowerPointExtractor extractor) 
{
-        super(extractor);
+        super(extractor, 
"application/vnd.openxmlformats-officedocument.presentationml.presentation");
     }
 
     /**

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=789125&r1=789124&r2=789125&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
 Sun Jun 28 17:05:49 2009
@@ -35,7 +35,7 @@
 public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
 
     public XSSFExcelExtractorDecorator(XSSFExcelExtractor extractor) {
-        super(extractor);
+        super(extractor, 
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
     }
 
     /**

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=789125&r1=789124&r2=789125&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
 Sun Jun 28 17:05:49 2009
@@ -39,7 +39,7 @@
 public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
 
     public XWPFWordExtractorDecorator(XWPFWordExtractor extractor) {
-        super(extractor);
+        super(extractor, 
"application/vnd.openxmlformats-officedocument.wordprocessingml.document");
     }
 
     /**

Modified: 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=789125&r1=789124&r2=789125&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
 Sun Jun 28 17:05:49 2009
@@ -84,12 +84,38 @@
         assertTypeByName("text/html", "x.html");
         assertTypeByName("application/xhtml+xml", "x.xhtml");
         assertTypeByName("application/xml", "x.xml");
-        assertTypeByName("application/msword", "x.doc");
-        assertTypeByName("application/vnd.ms-powerpoint", "x.ppt");
-        assertTypeByName("application/vnd.ms-excel", "x.xls");
         assertTypeByName("application/zip", "x.zip");
         assertTypeByName("application/vnd.oasis.opendocument.text", "x.odt");
         assertTypeByName("application/octet-stream", "x.xyz");
+
+        // Test for the MS Office media types and file extensions listed in
+        // 
http://blogs.msdn.com/vsofficedeveloper/pages/Office-2007-Open-XML-MIME-Types.aspx
+        assertTypeByName("application/msword", "x.doc");
+        assertTypeByName("application/msword", "x.dot");
+        
assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.document",
 "x.docx");
+        
assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.template",
 "x.dotx");
+        assertTypeByName("application/vnd.ms-word.document.macroenabled.12", 
"x.docm");
+        assertTypeByName("application/vnd.ms-word.template.macroenabled.12", 
"x.dotm");
+        assertTypeByName("application/vnd.ms-excel", "x.xls");
+        assertTypeByName("application/vnd.ms-excel", "x.xlt");
+        assertTypeByName("application/vnd.ms-excel", "x.xla");
+        
assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
 "x.xlsx");
+        
assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.template",
 "x.xltx");
+        assertTypeByName("application/vnd.ms-excel.sheet.macroenabled.12", 
"x.xlsm");
+        assertTypeByName("application/vnd.ms-excel.template.macroenabled.12", 
"x.xltm");
+        assertTypeByName("application/vnd.ms-excel.addin.macroenabled.12", 
"x.xlam");
+        
assertTypeByName("application/vnd.ms-excel.sheet.binary.macroenabled.12", 
"x.xlsb");
+        assertTypeByName("application/vnd.ms-powerpoint", "x.ppt");
+        assertTypeByName("application/vnd.ms-powerpoint", "x.pot");
+        assertTypeByName("application/vnd.ms-powerpoint", "x.pps");
+        assertTypeByName("application/vnd.ms-powerpoint", "x.ppa");
+        
assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.presentation",
 "x.pptx");
+        
assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.template",
 "x.potx");
+        
assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.slideshow",
 "x.ppsx");
+        
assertTypeByName("application/vnd.ms-powerpoint.addin.macroenabled.12", 
"x.ppam");
+        
assertTypeByName("application/vnd.ms-powerpoint.presentation.macroenabled.12", 
"x.pptm");
+        
assertTypeByName("application/vnd.ms-powerpoint.presentation.macroenabled.12", 
"x.potm");
+        
assertTypeByName("application/vnd.ms-powerpoint.slideshow.macroenabled.12", 
"x.ppsm");
     }
 
     public void testJpegDetection() throws Exception {

Modified: 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=789125&r1=789124&r2=789125&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 Sun Jun 28 17:05:49 2009
@@ -44,7 +44,7 @@
             parser.parse(input, handler, metadata);
             
             assertEquals(
-                    
"application/vnd.openxmlformats-package.core-properties+xml",
+                    
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                     metadata.get(Metadata.CONTENT_TYPE));
             assertEquals("Simple Excel document", 
metadata.get(Metadata.TITLE));
             assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
@@ -74,7 +74,7 @@
             parser.parse(input, handler, metadata);
             
             assertEquals(
-                    
"application/vnd.openxmlformats-package.core-properties+xml",
+                    
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
                     metadata.get(Metadata.CONTENT_TYPE));
             assertEquals("Sample Powerpoint Slide", 
metadata.get(Metadata.TITLE));
             assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
@@ -101,7 +101,7 @@
             parser.parse(input, handler, metadata);
             
             assertEquals(
-                    
"application/vnd.openxmlformats-package.core-properties+xml",
+                    
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                     metadata.get(Metadata.CONTENT_TYPE));
             assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
             assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));


Reply via email to