Author: nick
Date: Thu Jul 29 16:54:46 2010
New Revision: 980506
URL: http://svn.apache.org/viewvc?rev=980506&view=rev
Log:
Slightly improve OLE2 file type matches, for cases where the OLE2 properties
stream is in one of the first couple of blocks in the file. Add a note about
using the ContainerAwareDetector for better results. (TIKA-447)
Modified:
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
Modified:
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=980506&r1=980505&r2=980506&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Thu Jul 29 16:54:46 2010
@@ -209,6 +209,7 @@
<!-- http://www.iana.org/assignments/media-types/application/msword -->
<mime-type type="application/msword">
+ <!-- Use org.apache.tika.detect.ContainerAwareDetector for more reliable detection of OLE2 documents -->
<alias type="application/vnd.ms-word"/>
<comment>Microsoft Word Document</comment>
<magic priority="50">
@@ -222,7 +223,9 @@
<match value="\354\245\301" type="string" offset="512"/>
<match value="\320\317\021\340\241\261\032\341" type="string" offset="0"/>
<match value="\224\246\056" type="string" offset="0"/>
- <match value="R\0o\0o\0t\0\ \0E\0n\0t\0r\0y" type="string" offset="512"/>
+ <match value="0xd0cf11e0a1b11ae1" type="string" offset="0:8">
+ <match value="W\x00o\x00r\x00d\x00D\x00o\x00c\x00u\x00m\x00e\x00n\x00t" type="string" offset="1152:4096" />
+ </match>
</magic>
<glob pattern="*.doc"/>
<glob pattern="*.dot"/>
@@ -1158,6 +1161,7 @@
<!-- http://www.iana.org/assignments/media-types/application/vnd.ms-excel -->
<mime-type type="application/vnd.ms-excel">
+ <!-- Use org.apache.tika.detect.ContainerAwareDetector for more reliable detection of OLE2 documents -->
<alias type="application/msexcel" />
<comment>Microsoft Excel Spreadsheet</comment>
<magic priority="50">
@@ -1166,6 +1170,9 @@
<match value="Biff5" type="string" offset="2114"/>
<match value="Biff5" type="string" offset="2121"/>
<match value="\x09\x04\x06\x00\x00\x00\x10\x00" type="string" offset="0"/>
+ <match value="0xd0cf11e0a1b11ae1" type="string" offset="0:8">
+ <match value="W\x00o\x00r\x00k\x00b\x00o\x00o\x00k" type="string" offset="1152:4096" />
+ </match>
</magic>
<glob pattern="*.xls"/>
<glob pattern="*.xlm"/>
@@ -1230,8 +1237,14 @@
<!-- http://www.iana.org/assignments/media-types/application/vnd.ms-powerpoint -->
<mime-type type="application/vnd.ms-powerpoint">
+ <!-- Use org.apache.tika.detect.ContainerAwareDetector for more reliable detection of OLE2 documents -->
<alias type="application/mspowerpoint"/>
<comment>Microsoft Powerpoint Presentation</comment>
+ <magic priority="50">
+ <match value="0xd0cf11e0a1b11ae1" type="string" offset="0:8">
+ <match value="P\x00o\x00w\x00e\x00r\x00P\x00o\x00i\x00n\x00t\x00 \x00D\x00o\x00c\x00u\x00m\x00e\x00n\x00t" type="string" offset="1152:4096" />
+ </match>
+ </magic>
<glob pattern="*.ppz"/>
<glob pattern="*.ppt"/>
<glob pattern="*.pps"/>