Author: jukka
Date: Mon Jul 13 20:24:28 2009
New Revision: 793696

URL: http://svn.apache.org/viewvc?rev=793696&view=rev
Log:
TIKA-257: Incorrect mime-type detection for ooxml

I found a pretty reliable magic byte pattern for ooxml files!

Modified:
    
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
    
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java

Modified: 
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=793696&r1=793695&r2=793696&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
 (original)
+++ 
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
 Mon Jul 13 20:24:28 2009
@@ -251,6 +251,11 @@
 
   <mime-type type="application/x-tika-ooxml">
     <sub-class-of type="application/zip"/>
+    <magic priority="50">
+      <match value="PK\003\004" type="string" offset="0">
+        <match value="[Content_Types].xml" type="string" offset="30"/>
+      </match>
+    </magic>
   </mime-type>
 
   <mime-type 
type="application/vnd.openxmlformats-officedocument.presentationml.presentation">

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=793696&r1=793695&r2=793696&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
 Mon Jul 13 20:24:28 2009
@@ -62,13 +62,12 @@
      * @see 
org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(org.xml.sax.ContentHandler,
      *      org.apache.tika.metadata.Metadata)
      */
-    public XHTMLContentHandler getXHTML(ContentHandler handler,
-            Metadata metadata) throws SAXException, XmlException, IOException {
+    public void getXHTML(ContentHandler handler, Metadata metadata)
+            throws SAXException, XmlException, IOException {
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
         buildXHTML(xhtml);
         xhtml.endDocument();
-        return xhtml;
     }
 
     /**

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java?rev=793696&r1=793695&r2=793696&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
 Mon Jul 13 20:24:28 2009
@@ -21,7 +21,6 @@
 import org.apache.poi.POIXMLDocument;
 import org.apache.poi.POIXMLTextExtractor;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.xmlbeans.XmlException;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -47,9 +46,9 @@
     MetadataExtractor getMetadataExtractor();
 
     /**
-     * Returns to clients a {@link XHTMLContentHandler} object representing the
-     * parsed content of a document as XHTML SAX events.
+     * Parses the document into a sequence of XHTML SAX events sent to the
+     * given content handler.
      */
-    XHTMLContentHandler getXHTML(ContentHandler handler, Metadata metadata)
+    void getXHTML(ContentHandler handler, Metadata metadata)
             throws SAXException, XmlException, IOException;
 }

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java?rev=793696&r1=793695&r2=793696&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
 Mon Jul 13 20:24:28 2009
@@ -47,9 +47,8 @@
             OOXMLExtractor extractor = OOXMLExtractorFactory
                     .createExtractor((POIXMLTextExtractor) ExtractorFactory
                             .createExtractor(stream));
-            extractor.getXHTML(handler, metadata);
             extractor.getMetadataExtractor().extract(metadata);
-
+            extractor.getXHTML(handler, metadata);
         } catch (InvalidFormatException e) {
             throw new TikaException("Error creating OOXML extractor", e);
         } catch (OpenXML4JException e) {

Modified: 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=793696&r1=793695&r2=793696&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
 Mon Jul 13 20:24:28 2009
@@ -118,6 +118,12 @@
         
assertTypeByName("application/vnd.ms-powerpoint.slideshow.macroenabled.12", 
"x.ppsm");
     }
 
+    public void testOoxmlDetection() throws Exception {
+        assertTypeByData("application/x-tika-ooxml", "testWORD.docx");
+        assertTypeByData("application/x-tika-ooxml", "testEXCEL.xlsx");
+        assertTypeByData("application/x-tika-ooxml", "testPPT.pptx");
+    }
+
     public void testJpegDetection() throws Exception {
         assertType("image/jpeg", "testJPEG.jpg");
         assertTypeByData("image/jpeg", "testJPEG.jpg");


Reply via email to