This is an automated email from the ASF dual-hosted git repository. nick pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/tika.git
commit 9a2c7d89e03ca7c0e821b69c394165297edfb9d4
Author: Nick Burch <n...@gagravarr.org>
AuthorDate: Thu Sep 6 09:28:14 2018 +0100

    Mime magic for "MIME Encapsulation of Aggregate HTML Documents" (MHTML), pulled out from rfc822 (may not be fully correct long-term...)
---
 .../org/apache/tika/mime/tika-mimetypes.xml | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 007ec53..bd1adfa 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -5980,9 +5980,28 @@
     </magic>
     <glob pattern="*.eml"/>
     <glob pattern="*.mime"/>
+    <sub-class-of type="text/x-tika-text-based-message"/>
+  </mime-type>
+
+  <!-- TODO See TIKA-2723 for discussions on the mime type hierarchy -->
+  <!-- and best parser structure for these email-like formats -->
+  <mime-type type="multipart/related">
+    <acronym>MHTML</acronym>
+    <_comment>MIME Encapsulation of Aggregate HTML Documents</_comment>
+    <tika:link>http://tools.ietf.org/html/rfc2557</tika:link>
+    <alias type="application/x-mimearchive"/>
+    <alias type="message/rfc2557"/>
+    <!-- higher priority than message/rfc822 -->
+    <magic priority="60">
+      <match value="From: \x3cSaved by Windows Internet Explorer 8\x3e" type="stringignorecase" offset="0"/>
+      <match value="From: \x22Saved by Internet Explorer 11\x22" type="stringignorecase" offset="0"/>
+      <match value="MIME-Version: 1.0" type="string" offset="0">
+        <match value="\nContent-Type: multipart/related" type="string" offset="16:512"/>
+      </match>
+    </magic>
     <glob pattern="*.mht"/>
     <glob pattern="*.mhtml"/>
-    <sub-class-of type="text/x-tika-text-based-message"/>
+    <sub-class-of type="message/rfc822"/>
   </mime-type>
 
   <mime-type type="message/s-http"/>
@@ -6084,7 +6103,6 @@
   <mime-type type="multipart/header-set"/>
   <mime-type type="multipart/mixed"/>
   <mime-type type="multipart/parallel"/>
-  <mime-type type="multipart/related"/>
   <mime-type type="multipart/report"/>
   <mime-type type="multipart/signed"/>
   <mime-type type="multipart/voice-message"/>