This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4153
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d70e2e9ad929ae85cb4a126de53f67afc7555c27
Author: tballison <[email protected]>
AuthorDate: Wed Oct 11 09:21:15 2023 -0400

    TIKA-4153 -- tighten rfc822 match to require a known header at offset 0
---
 .../org/apache/tika/mime/tika-mimetypes.xml        | 20 ++----
 .../org/apache/tika/mime/RFC822DetectionTest.java  | 74 ++++++++++++++++++++++
 2 files changed, 78 insertions(+), 16 deletions(-)

diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 366c687e7..766a7b4df 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -6867,22 +6867,10 @@
       </match>
     </magic>
     <magic priority="45">
-      <!-- be a bit more flexible, but require two of these -->
-      <match minShouldMatch="2">
-        <match value="Date:" type="stringignorecase" offset="0"/>
-        <match value="Delivered-To:" type="string" offset="0"/>
-        <match value="From:" type="stringignorecase" offset="0"/>
-        <match value="Message-ID:" type="stringignorecase" offset="0"/>
-        <match value="MIME-Version:" type="stringignorecase" offset="0"/>
-        <match value="Received:" type="stringignorecase" offset="0"/>
-        <match value="Relay-Version:" type="stringignorecase" offset="0"/>
-        <match value="Return-Path:" type="stringignorecase" offset="0"/>
-        <match value="Sent:" type="string" offset="0"/>
-        <match value="Status:" type="string" offset="0"/>
-        <match value="User-Agent:" type="string" offset="0"/>
-        <match value="X-Mailer:" type="string" offset="0"/>
-        <match value="X-Originating-IP:" type="stringignorecase" offset="0"/>
-
+      <!-- be a bit more flexible, but require two of these
+      TODO: fix this mess by developing <and/> <or/> clauses
+      -->
+      <match value="(?:(?:(?i)Content-ID|Content-Location|Content-Transfer-Encoding|Content-Type|Date|From|Message-ID|MIME-Version|Received|Relay-Version|Return-Path|X-Originating-IP)|(?:Delivered-To|Sent|Status|Subject|To|User-Agent|X-Mailer)):" type="regex" offset="0">
         <match value="\nContent-ID:" type="stringignorecase" offset="0:1024"/>
         <match value="\nContent-Location:" type="stringignorecase" offset="0:1024"/>
         <match value="\nContent-Transfer-Encoding:" type="stringignorecase" offset="0:1024"/>
diff --git a/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java
new file mode 100644
index 000000000..e622edd00
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+
+public class RFC822DetectionTest {
+
+    private static final MimeTypes MIME_TYPES = TikaConfig.getDefaultConfig().getMimeRepository();
+
+    @Test
+    public void testRFC822() throws Exception {
+        for (String txt : new String[]{
+                "Date: blah\nSent: someone\r\nthis is a test",
+                "date: blah\nSent: someone\r\nthis is a test",
+                "date: blah\nDelivered-To: someone\r\nthis is a test"
+        }) {
+            assertMime("message/rfc822", txt);
+        }
+        for (String txt : new String[]{
+                //test missing colon
+                "Date blah\nSent: someone\r\nthis is a test",
+                //test precursor junk
+                "some precursor junk Date: blah\nSent: someone\r\nthis is a test",
+                "some precursor junk\nDate: blah\nSent: someone\r\nthis is a test",
+                "some precursor junk:\nDate: blah\nSent: someone\r\nthis is a test",
+                //confirm that date is case-insensitive, but delivered-to is case-sensitive
+                "date: blah\ndelivered-To: someone\r\nthis is a test"
+        }) {
+            assertMime("text/plain", txt);
+        }
+
+        //TIKA-4153, specifically
+        String txt = "Some text here 1.\n" + "Some text here 2.\n" + "Some text here 3.\n" +
+                "Original Message-----\n" + "From: [email protected]\n" +
+                "Sent: Thursday, October 31, 2019 9:52 AM\n" +
+                "To: Some person, (The XYZ group)\n" +
+                "Subject: RE: Mr. Random person phone call: MESSAGE\n" + "Hi,\n" +
+                "I am available now to receive the call.\n" + "Some text here 4.\n" +
+                "Some text here 5.\n" + "Some text here 6.";
+        assertMime("text/plain", txt);
+    }
+
+    private void assertMime(String expected, String txt) throws IOException {
+
+        MediaType mediaType =
+                MIME_TYPES.detect(UnsynchronizedByteArrayInputStream.builder()
+                        .setByteArray(txt.getBytes(StandardCharsets.UTF_8)).get(), new Metadata());
+        assertEquals(expected, mediaType.toString(), txt);
+    }
+}

Reply via email to