Repository: tika Updated Branches: refs/heads/master f00ab040d -> 72d2d88b3
TIKA-2042 MBOX magic and detection unit test Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/72d2d88b Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/72d2d88b Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/72d2d88b Branch: refs/heads/master Commit: 72d2d88b381ba75942ae791042ef54af33ee1f38 Parents: f00ab04 Author: Nick Burch <[email protected]> Authored: Tue Jul 26 11:36:29 2016 +0100 Committer: Nick Burch <[email protected]> Committed: Tue Jul 26 11:36:29 2016 +0100 ---------------------------------------------------------------------- .../org/apache/tika/mime/tika-mimetypes.xml | 15 ++++++++++++++- .../java/org/apache/tika/mime/TestMimeTypes.java | 3 +++ 2 files changed, 17 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/72d2d88b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml ---------------------------------------------------------------------- diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index b39f529..22a814c 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -365,9 +365,22 @@ <mime-type type="application/mbms-register+xml"/> <mime-type type="application/mbms-register-response+xml"/> <mime-type type="application/mbms-user-service-description+xml"/> + <mime-type type="application/mbox"> - <sub-class-of type="text/plain"/> + <!-- MBOX files start with "From [sender] [date]" --> + <!-- To avoid false matches, check for other headers after that --> + <magic priority="70"> + <match value="From " type="string" offset="0"> + <match value="\nFrom: " type="string" offset="32:256"/> + <match value="\nDate: " type="string" offset="32:256"/> + <match value="\nDelivered-To: " type="string" offset="32:256"/> + <match value="\nReceived: by " type="string" offset="32:256"/> + <match value="\nReceived: via " type="string" offset="32:256"/> + <match value="\nReceived: from " type="string" offset="32:256"/> + </match> + </magic> <glob pattern="*.mbox"/> + <sub-class-of type="text/x-tika-text-based-message"/> </mime-type> <mime-type type="application/media_control+xml"/> <mime-type type="application/mediaservercontrol+xml"> http://git-wip-us.apache.org/repos/asf/tika/blob/72d2d88b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java index 81b154c..d35a716 100644 --- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java +++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java @@ -869,6 +869,9 @@ public class TestMimeTypes { // Lotus assertTypeDetection("testLotusEml.eml", "message/rfc822"); + // MBOX + assertTypeDetection("headers.mbox", "application/mbox"); + // Thunderbird - doesn't currently work by name assertTypeByNameAndData("message/rfc822", "testThunderbirdEml.eml"); }
