Repository: tika Updated Branches: refs/heads/master b2821d921 -> de6dbd0a7
TIKA-1971 - add another magic for rfc822 Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/e08d0065 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/e08d0065 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/e08d0065 Branch: refs/heads/master Commit: e08d00654a77267686a112e0665c324ef041c033 Parents: bc0b1f7 Author: tballison <[email protected]> Authored: Tue May 17 10:05:35 2016 -0400 Committer: tballison <[email protected]> Committed: Tue May 17 10:05:35 2016 -0400 ---------------------------------------------------------------------- .../org/apache/tika/mime/tika-mimetypes.xml | 12 +++++++ .../tika/parser/mail/RFC822ParserTest.java | 11 +++++++ .../test-documents/testRFC822_date_utf8 | 8 +++++ .../resources/test-documents/testRFC822_eml | 33 ++++++++++++++++++++ 4 files changed, 64 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/e08d0065/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml ---------------------------------------------------------------------- diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index d82bd95..11b2c3f 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -679,6 +679,17 @@ <sub-class-of type="text/plain"/> </mime-type> + <mime-type type="application/x-dbf"> + <_comment>DBF/dbase 3 File</_comment> + <magic priority="100"> + <match value="(?s)^[\x02\x03\x30\x31\x32\x43\x63\x83\x8B\xCB\xF5\xE5\xFB].[\x01-\x0C][\x01-\x1F].{4}(?:.[^\x00]|[\x41-\xFF].)(?:[^\x00\x01].|.[^\x00]).{31}([\x00][^\x00]{0,10})[BCDFGILMNOPQTVWXY@+]" type="regex" offset="0"/> + </magic> + <glob pattern="*.dbf"/> + <glob pattern="*.dbase"/> + <glob pattern="*.dbase3"/> + <sub-class-of type="text/plain"/> + </mime-type> + <mime-type type="application/rtx"/> <mime-type type="application/samlassertion+xml"/> <mime-type type="application/samlmetadata+xml"/> @@ -5299,6 +5310,7 @@ <mime-type type="message/rfc822"> <magic priority="50"> + <match value="Delivered-To:" type="string" offset="0"/> <match value="Status:" type="string" offset="0"/> <match value="X-Mozilla-Status:" type="string" offset="0"/> <match value="X-Mozilla-Status2:" type="string" offset="0"/> http://git-wip-us.apache.org/repos/asf/tika/blob/e08d0065/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java index a80e544..0e8f613 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java @@ -381,4 +381,15 @@ public class RFC822ParserTest extends TikaTest { assertEquals(MediaType.TEXT_HTML, tracker.mediaTypes.get(1)); assertEquals(MediaType.image("gif"), tracker.mediaTypes.get(2)); } + + @Test + public void testDetection() throws Exception { + //test simple text file + XMLResult r = getXML("testRFC822_date_utf8"); + assertEquals("message/rfc822", r.metadata.get(Metadata.CONTENT_TYPE)); + + //test without extension + r = getXML("testRFC822_eml"); + assertEquals("message/rfc822", r.metadata.get(Metadata.CONTENT_TYPE)); + } } http://git-wip-us.apache.org/repos/asf/tika/blob/e08d0065/tika-parsers/src/test/resources/test-documents/testRFC822_date_utf8 ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testRFC822_date_utf8 b/tika-parsers/src/test/resources/test-documents/testRFC822_date_utf8 new file mode 100644 index 0000000..c3fd0b2 --- /dev/null +++ b/tika-parsers/src/test/resources/test-documents/testRFC822_date_utf8 @@ -0,0 +1,8 @@ +From: Philipp Steinkrüger <[email protected]> +Subject: Testemail +Date: 16 May 2016 at 09:30:32 GMT+1 +To: Philipp Steinkrüger <[email protected]> + + + GMT+1 + http://git-wip-us.apache.org/repos/asf/tika/blob/e08d0065/tika-parsers/src/test/resources/test-documents/testRFC822_eml ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testRFC822_eml b/tika-parsers/src/test/resources/test-documents/testRFC822_eml new file mode 100644 index 0000000..5ae32ae --- /dev/null +++ b/tika-parsers/src/test/resources/test-documents/testRFC822_eml @@ -0,0 +1,33 @@ +Delivered-To: [email protected] +Received: by 10.114.185.229 with SMTP id ff5csp1359148ldc; + Mon, 16 May 2016 01:30:35 -0700 (PDT) +X-Received: by 10.194.165.226 with SMTP id zb2mr30717859wjb.172.1463387435671; + Mon, 16 May 2016 01:30:35 -0700 (PDT) +Return-Path: <[email protected]> +Received: from smtp-out.rrz.uni-koeln.de (smtp-out.rrz.uni-koeln.de. [2a00:a200:0:12::25]) + by mx.google.com with ESMTPS id tk4si20973899wjb.25.2016.05.16.01.30.35 + for <[email protected]> + (version=TLS1_2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128/128); + Mon, 16 May 2016 01:30:35 -0700 (PDT) +Received-SPF: pass (google.com: domain of [email protected] designates 2a00:a200:0:12::25 as permitted sender) client-ip=2a00:a200:0:12::25; +Authentication-Results: mx.google.com; + spf=pass (google.com: domain of [email protected] designates 2a00:a200:0:12::25 as permitted sender) [email protected] +Received: from smtp-auth.rrz.uni-koeln.de (smtp-auth.rrz.uni-koeln.de [IPv6:2a00:a200:0:10::27] (may be forged)) + by smtp-out.rrz.uni-koeln.de (8.14.4/8.14.4) with ESMTP id u4G8UYXw029242 + for <[email protected]>; Mon, 16 May 2016 10:30:34 +0200 +Received: from [192.168.1.10] (79-66-108-216.dynamic.dsl.as9105.com [79.66.108.216]) + (authenticated as user altj4 using DIGEST-MD5 bits=0) + by smtp-auth.uni-koeln.de (8.13.8/8.13.8) with ESMTP id u4G8UXlP028450 + (version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-SHA bits=256 verify=NO) + for <[email protected]>; Mon, 16 May 2016 10:30:34 +0200 +From: =?utf-8?Q?Philipp_Steinkr=C3=BCger?= <[email protected]> +Content-Type: text/plain +Content-Transfer-Encoding: 7bit +Subject: Testemail +Message-Id: <[email protected]> +Date: Mon, 16 May 2016 09:30:32 +0100 +To: =?utf-8?Q?Philipp_Steinkr=C3=BCger?= <[email protected]> +Mime-Version: 1.0 (Mac OS X Mail 9.3 \(3124\)) +X-Mailer: Apple Mail (2.3124) +X-Scanned-By: MIMEDefang 2.75 +
