TIKA-2042 MBOX magic and detection unit test
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/65cc9bce
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/65cc9bce
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/65cc9bce

Branch: refs/heads/2.x
Commit: 65cc9bcecdc6b86294a88f3b2b6b26017f356ae5
Parents: 31374a3
Author: Nick Burch <[email protected]>
Authored: Tue Jul 26 11:36:29 2016 +0100
Committer: Nick Burch <[email protected]>
Committed: Tue Jul 26 12:06:50 2016 +0100

----------------------------------------------------------------------
 .../java/org/apache/tika/mime/TestMimeTypes.java |  3 +++
 .../org/apache/tika/mime/tika-mimetypes.xml | 15 ++++++++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/tika/blob/65cc9bce/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 09864b8..d4840b7 100644
--- a/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -833,6 +833,9 @@ public class TestMimeTypes extends TikaTest {
         // Lotus
         assertTypeDetection("testLotusEml.eml", "message/rfc822");
 
+        // MBOX
+        assertTypeDetection("headers.mbox", "application/mbox");
+
         // Thunderbird - doesn't currently work by name
         assertTypeByNameAndData("message/rfc822", "testThunderbirdEml.eml");
     }

http://git-wip-us.apache.org/repos/asf/tika/blob/65cc9bce/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index e07f449..1d1f70a 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -361,9 +361,22 @@
   <mime-type type="application/mbms-register+xml"/>
   <mime-type type="application/mbms-register-response+xml"/>
   <mime-type type="application/mbms-user-service-description+xml"/>
+
   <mime-type type="application/mbox">
-    <sub-class-of type="text/plain"/>
+    <!-- MBOX files start with "From [sender] [date]" -->
+    <!-- To avoid false matches, check for other headers after that -->
+    <magic priority="70">
+      <match value="From " type="string" offset="0">
+        <match value="\nFrom: " type="string" offset="32:256"/>
+        <match value="\nDate: " type="string" offset="32:256"/>
+        <match value="\nDelivered-To: " type="string" offset="32:256"/>
+        <match value="\nReceived: by " type="string" offset="32:256"/>
+        <match value="\nReceived: via " type="string" offset="32:256"/>
+        <match value="\nReceived: from " type="string" offset="32:256"/>
+      </match>
+    </magic>
     <glob pattern="*.mbox"/>
+    <sub-class-of type="text/x-tika-text-based-message"/>
   </mime-type>
   <mime-type type="application/media_control+xml"/>
   <mime-type type="application/mediaservercontrol+xml">
