This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_2x in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_2x by this push: new d25ec9b6b TIKA-4153 -- tighten RFC822 detection (#1396) new 5a9ef38f0 Merge remote-tracking branch 'origin/branch_2x' into branch_2x d25ec9b6b is described below commit d25ec9b6bfd0a7d702242e4a64a0215347a6d2b3 Author: Tim Allison <talli...@apache.org> AuthorDate: Wed Oct 11 10:18:10 2023 -0400 TIKA-4153 -- tighten RFC822 detection (#1396) * TIKA-4153 -- tighten rfc822 match to require a known header at offset 0 (cherry picked from commit c2e6b01b070e7961dc37d043018f10d41223fd9f) --- .../org/apache/tika/mime/tika-mimetypes.xml | 95 ++++++++++++---------- .../org/apache/tika/mime/RFC822DetectionTest.java | 79 ++++++++++++++++++ 2 files changed, 133 insertions(+), 41 deletions(-) diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index 366c687e7..b49e355e6 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -2135,12 +2135,16 @@ <!-- should have a higher priority than rfc822 - TIKA-3489 --> <magic priority="55"> <match minShouldMatch="2"> - <match value="user-agent:" type="stringignorecase" offset="0"/> - <match value="sitemap:" type="stringignorecase" offset="0"/> - <match value="\nuser-agent:" type="stringignorecase" offset="0:1000"/> - <match value="\nallow:" type="stringignorecase" offset="0:1000"/> - <match value="\ndisallow:" type="stringignorecase" offset="0:1000"/> - <match value="\nsitemap:" type="stringignorecase" offset="0:1000"/> + <match minShouldMatch="1"> + <match value="user-agent:" type="stringignorecase" offset="0"/> + <match value="sitemap:" type="stringignorecase" offset="0"/> + </match> + <match minShouldMatch="1"> + <match value="\nuser-agent:" type="stringignorecase" offset="0:1000"/> + <match value="\nallow:" type="stringignorecase" offset="0:1000"/> + <match value="\ndisallow:" type="stringignorecase" offset="0:1000"/> + <match value="\nsitemap:" type="stringignorecase" offset="0:1000"/> + </match> </match> </magic> <sub-class-of type="text/plain"/> @@ -6867,42 +6871,51 @@ </match> </magic> <magic priority="45"> - <!-- be a bit more flexible, but require two of these --> + <!-- be a bit more flexible, but require one from each of these --> <match minShouldMatch="2"> - <match value="Date:" type="stringignorecase" offset="0"/> - <match value="Delivered-To:" type="string" offset="0"/> - <match value="From:" type="stringignorecase" offset="0"/> - <match value="Message-ID:" type="stringignorecase" offset="0"/> - <match value="MIME-Version:" type="stringignorecase" offset="0"/> - <match value="Received:" type="stringignorecase" offset="0"/> - <match value="Relay-Version:" type="stringignorecase" offset="0"/> - <match value="Return-Path:" type="stringignorecase" offset="0"/> - <match value="Sent:" type="string" offset="0"/> - <match value="Status:" type="string" offset="0"/> - <match value="User-Agent:" type="string" offset="0"/> - <match value="X-Mailer:" type="string" offset="0"/> - <match value="X-Originating-IP:" type="stringignorecase" offset="0"/> - - <match value="\nContent-ID:" type="stringignorecase" offset="0:1024"/> - <match value="\nContent-Location:" type="stringignorecase" offset="0:1024"/> - <match value="\nContent-Transfer-Encoding:" type="stringignorecase" offset="0:1024"/> - <match value="\nContent-Type:" type="stringignorecase" offset="0:1024"/> - <match value="\nDate:" type="stringignorecase" offset="0:1024"/> - <match value="\nDelivered-To:" type="string" offset="0:1024"/> - <match value="\nFrom:" type="stringignorecase" offset="0:1024"/> - <match value="\nMIME-Version:" type="stringignorecase" offset="0:1024"/> - <match value="\nReceived:" type="stringignorecase" offset="0:1024"/> - <match value="\nRelay-Version:" type="stringignorecase" offset="0:1024"/> - <match value="\nReturn-Path:" type="stringignorecase" offset="0:1024"/> - <match value="\nSent:" type="string" offset="0:1024"/> - <match value="\nStatus:" type="string" offset="0:1024"/> - <match value="\nSubject:" type="string" offset="0:1024"/> - <match value="\nTo:" type="string" offset="0:1024"/> - <match value="\nUser-Agent:" type="string" offset="0:1024"/> - <match value="\nX-Mailer:" type="string" offset="0:1024"/> - <match value="\nX-Originating-IP:" type="stringignorecase" offset="0:1024"/> - <match value="\nDKIM-" type="string" offset="0:1024"/> - <match value="\nARC-" type="string" offset="0:1024"/> + <match minShouldMatch="1"> + <match value="Content-ID:" type="stringignorecase" offset="0"/> + <match value="Content-Location:" type="stringignorecase" offset="0"/> + <match value="Content-Transfer-Encoding:" type="stringignorecase" offset="0"/> + <match value="Content-Type:" type="stringignorecase" offset="0"/> + <match value="Date:" type="stringignorecase" offset="0"/> + <match value="Delivered-To:" type="string" offset="0"/> + <match value="From:" type="stringignorecase" offset="0"/> + <match value="Message-ID:" type="stringignorecase" offset="0"/> + <match value="MIME-Version:" type="stringignorecase" offset="0"/> + <match value="Received:" type="stringignorecase" offset="0"/> + <match value="Relay-Version:" type="stringignorecase" offset="0"/> + <match value="Return-Path:" type="stringignorecase" offset="0"/> + <match value="Sent:" type="string" offset="0"/> + <match value="Status:" type="string" offset="0"/> + <match value="Subject:" type="string" offset="0"/> + <match value="To:" type="string" offset="0"/> + <match value="User-Agent:" type="string" offset="0"/> + <match value="X-Mailer:" type="string" offset="0"/> + <match value="X-Originating-IP:" type="stringignorecase" offset="0"/> + </match> + <match minShouldMatch="1"> + <match value="\nContent-ID:" type="stringignorecase" offset="0:1024"/> + <match value="\nContent-Location:" type="stringignorecase" offset="0:1024"/> + <match value="\nContent-Transfer-Encoding:" type="stringignorecase" offset="0:1024"/> + <match value="\nContent-Type:" type="stringignorecase" offset="0:1024"/> + <match value="\nDate:" type="stringignorecase" offset="0:1024"/> + <match value="\nDelivered-To:" type="string" offset="0:1024"/> + <match value="\nFrom:" type="stringignorecase" offset="0:1024"/> + <match value="\nMIME-Version:" type="stringignorecase" offset="0:1024"/> + <match value="\nReceived:" type="stringignorecase" offset="0:1024"/> + <match value="\nRelay-Version:" type="stringignorecase" offset="0:1024"/> + <match value="\nReturn-Path:" type="stringignorecase" offset="0:1024"/> + <match value="\nSent:" type="string" offset="0:1024"/> + <match value="\nStatus:" type="string" offset="0:1024"/> + <match value="\nSubject:" type="string" offset="0:1024"/> + <match value="\nTo:" type="string" offset="0:1024"/> + <match value="\nUser-Agent:" type="string" offset="0:1024"/> + <match value="\nX-Mailer:" type="string" offset="0:1024"/> + <match value="\nX-Originating-IP:" type="stringignorecase" offset="0:1024"/> + <match value="\nDKIM-" type="string" offset="0:1024"/> + <match value="\nARC-" type="string" offset="0:1024"/> + </match> </match> </magic> <magic priority="40"> diff --git a/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java new file mode 100644 index 000000000..7340e06a2 --- /dev/null +++ b/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.mime; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; + +import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; +import org.junit.jupiter.api.Test; + +import org.apache.tika.config.TikaConfig; +import org.apache.tika.metadata.Metadata; + +public class RFC822DetectionTest { + + private static final MimeTypes MIME_TYPES = TikaConfig.getDefaultConfig().getMimeRepository(); + + @Test + public void testBasic() throws Exception { + for (String txt : new String[]{ + "Date: blah\nSent: someone\r\nthis is a test", + "date: blah\nSent: someone\r\nthis is a test", + "date: blah\nDelivered-To: someone\r\nthis is a test" + }) { + assertMime("message/rfc822", txt); + } + for (String txt : new String[]{ + //test missing colon + "Date blah\nSent: someone\r\nthis is a test", + //test precursor junk + "some precursor junk Date: blah\nSent: someone\r\nthis is a test", + "some precursor junk\nDate: blah\nSent: someone\r\nthis is a test", + "some precursor junk:\nDate: blah\nSent: someone\r\nthis is a test", + //confirm that date is case-insensitive, but delivered-to is case-sensitive + "date: blah\ndelivered-To: someone\r\nthis is a test", + //test that a file that starts only with "Subject:" and no other header is + //detected as text/plain + "Subject: this is a subject\nand there's some other text", + "To: someone\nand there's some other text", + "To: someone or other" + }) { + assertMime("text/plain", txt); + } + + //TIKA-4153, specifically + String txt = "Some text here 1.\n" + "Some text here 2.\n" + "Some text here 3.\n" + + "Original Message-----\n" + "From: some_m...@abc.com\n" + + "Sent: Thursday, October 31, 2019 9:52 AM\n" + + "To: Some person, (The XYZ group)\n" + + "Subject: RE: Mr. Random person phone call: MESSAGE\n" + "Hi,\n" + + "I am available now to receive the call.\n" + "Some text here 4.\n" + + "Some text here 5.\n" + "Some text here 6."; + assertMime("text/plain", txt); + } + + private void assertMime(String expected, String txt) throws IOException { + + MediaType mediaType = + MIME_TYPES.detect(UnsynchronizedByteArrayInputStream.builder() + .setByteArray(txt.getBytes(StandardCharsets.UTF_8)).get(), new Metadata()); + assertEquals(expected, mediaType.toString(), txt); + } +}