Author: jukka
Date: Mon Jul 13 20:24:28 2009
New Revision: 793696
URL: http://svn.apache.org/viewvc?rev=793696&view=rev
Log:
TIKA-257: Incorrect mime-type detection for ooxml
I found a pretty reliable magic byte pattern for ooxml files!
Modified:
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Modified:
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=793696&r1=793695&r2=793696&view=diff
==============================================================================
---
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
(original)
+++
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
Mon Jul 13 20:24:28 2009
@@ -251,6 +251,11 @@
<mime-type type="application/x-tika-ooxml">
<sub-class-of type="application/zip"/>
+ <magic priority="50">
+ <match value="PK\003\004" type="string" offset="0">
+ <match value="[Content_Types].xml" type="string" offset="30"/>
+ </match>
+ </magic>
</mime-type>
<mime-type
type="application/vnd.openxmlformats-officedocument.presentationml.presentation">
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=793696&r1=793695&r2=793696&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
Mon Jul 13 20:24:28 2009
@@ -62,13 +62,12 @@
* @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(org.xml.sax.ContentHandler,
* org.apache.tika.metadata.Metadata)
*/
- public XHTMLContentHandler getXHTML(ContentHandler handler,
- Metadata metadata) throws SAXException, XmlException, IOException {
+ public void getXHTML(ContentHandler handler, Metadata metadata)
+ throws SAXException, XmlException, IOException {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
buildXHTML(xhtml);
xhtml.endDocument();
- return xhtml;
}
/**
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java?rev=793696&r1=793695&r2=793696&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
Mon Jul 13 20:24:28 2009
@@ -21,7 +21,6 @@
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.xmlbeans.XmlException;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -47,9 +46,9 @@
MetadataExtractor getMetadataExtractor();
/**
- * Returns to clients a {@link XHTMLContentHandler} object representing the
- * parsed content of a document as XHTML SAX events.
+ * Parses the document into a sequence of XHTML SAX events sent to the
+ * given content handler.
*/
- XHTMLContentHandler getXHTML(ContentHandler handler, Metadata metadata)
+ void getXHTML(ContentHandler handler, Metadata metadata)
throws SAXException, XmlException, IOException;
}
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java?rev=793696&r1=793695&r2=793696&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
Mon Jul 13 20:24:28 2009
@@ -47,9 +47,8 @@
OOXMLExtractor extractor = OOXMLExtractorFactory
.createExtractor((POIXMLTextExtractor) ExtractorFactory
.createExtractor(stream));
- extractor.getXHTML(handler, metadata);
extractor.getMetadataExtractor().extract(metadata);
-
+ extractor.getXHTML(handler, metadata);
} catch (InvalidFormatException e) {
throw new TikaException("Error creating OOXML extractor", e);
} catch (OpenXML4JException e) {
Modified:
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=793696&r1=793695&r2=793696&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Mon Jul 13 20:24:28 2009
@@ -118,6 +118,12 @@
assertTypeByName("application/vnd.ms-powerpoint.slideshow.macroenabled.12",
"x.ppsm");
}
+ public void testOoxmlDetection() throws Exception {
+ assertTypeByData("application/x-tika-ooxml", "testWORD.docx");
+ assertTypeByData("application/x-tika-ooxml", "testEXCEL.xlsx");
+ assertTypeByData("application/x-tika-ooxml", "testPPT.pptx");
+ }
+
public void testJpegDetection() throws Exception {
assertType("image/jpeg", "testJPEG.jpg");
assertTypeByData("image/jpeg", "testJPEG.jpg");