This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4153 in repository https://gitbox.apache.org/repos/asf/tika.git
commit d70e2e9ad929ae85cb4a126de53f67afc7555c27 Author: tballison <[email protected]> AuthorDate: Wed Oct 11 09:21:15 2023 -0400 TIKA-4153 -- tighten rfc822 match to require a known header at offset 0 --- .../org/apache/tika/mime/tika-mimetypes.xml | 20 ++---- .../org/apache/tika/mime/RFC822DetectionTest.java | 74 ++++++++++++++++++++++ 2 files changed, 78 insertions(+), 16 deletions(-) diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index 366c687e7..766a7b4df 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -6867,22 +6867,10 @@ </match> </magic> <magic priority="45"> - <!-- be a bit more flexible, but require two of these --> - <match minShouldMatch="2"> - <match value="Date:" type="stringignorecase" offset="0"/> - <match value="Delivered-To:" type="string" offset="0"/> - <match value="From:" type="stringignorecase" offset="0"/> - <match value="Message-ID:" type="stringignorecase" offset="0"/> - <match value="MIME-Version:" type="stringignorecase" offset="0"/> - <match value="Received:" type="stringignorecase" offset="0"/> - <match value="Relay-Version:" type="stringignorecase" offset="0"/> - <match value="Return-Path:" type="stringignorecase" offset="0"/> - <match value="Sent:" type="string" offset="0"/> - <match value="Status:" type="string" offset="0"/> - <match value="User-Agent:" type="string" offset="0"/> - <match value="X-Mailer:" type="string" offset="0"/> - <match value="X-Originating-IP:" type="stringignorecase" offset="0"/> - + <!-- be a bit more flexible, but require two of these + TODO: fix this mess by developing <and/> <or/> clauses + --> + <match value="(?:(?:(?i)Content-ID|Content-Location|Content-Transfer-Encoding|Content-Type|Date|From|Message-ID|MIME-Version|Received|Relay-Version|Return-Path|X-Originating-IP)|(?:Delivered-To|Sent|Status|Subject|To|User-Agent|X-Mailer)):" type="regex" offset="0"> <match value="\nContent-ID:" type="stringignorecase" offset="0:1024"/> <match value="\nContent-Location:" type="stringignorecase" offset="0:1024"/> <match value="\nContent-Transfer-Encoding:" type="stringignorecase" offset="0:1024"/> diff --git a/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java new file mode 100644 index 000000000..e622edd00 --- /dev/null +++ b/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.mime; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; + +import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; +import org.junit.jupiter.api.Test; + +import org.apache.tika.config.TikaConfig; +import org.apache.tika.metadata.Metadata; + +public class RFC822DetectionTest { + + private static final MimeTypes MIME_TYPES = TikaConfig.getDefaultConfig().getMimeRepository(); + + @Test + public void testRFC822() throws Exception { + for (String txt : new String[]{ + "Date: blah\nSent: someone\r\nthis is a test", + "date: blah\nSent: someone\r\nthis is a test", + "date: blah\nDelivered-To: someone\r\nthis is a test" + }) { + assertMime("message/rfc822", txt); + } + for (String txt : new String[]{ + //test missing colon + "Date blah\nSent: someone\r\nthis is a test", + //test precursor junk + "some precursor junk Date: blah\nSent: someone\r\nthis is a test", + "some precursor junk\nDate: blah\nSent: someone\r\nthis is a test", + "some precursor junk:\nDate: blah\nSent: someone\r\nthis is a test", + //confirm that date is case-insensitive, but delivered-to is case-sensitive + "date: blah\ndelivered-To: someone\r\nthis is a test" + }) { + assertMime("text/plain", txt); + } + + //TIKA-4153, specifically + String txt = "Some text here 1.\n" + "Some text here 2.\n" + "Some text here 3.\n" + + "Original Message-----\n" + "From: [email protected]\n" + + "Sent: Thursday, October 31, 2019 9:52 AM\n" + + "To: Some person, (The XYZ group)\n" + + "Subject: RE: Mr. Random person phone call: MESSAGE\n" + "Hi,\n" + + "I am available now to receive the call.\n" + "Some text here 4.\n" + + "Some text here 5.\n" + "Some text here 6."; + assertMime("text/plain", txt); + } + + private void assertMime(String expected, String txt) throws IOException { + + MediaType mediaType = + MIME_TYPES.detect(UnsynchronizedByteArrayInputStream.builder() + .setByteArray(txt.getBytes(StandardCharsets.UTF_8)).get(), new Metadata()); + assertEquals(expected, mediaType.toString(), txt); + } +}
