This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_2x by this push:
     new d25ec9b6b TIKA-4153 -- tighten RFC822 detection (#1396)
     new 5a9ef38f0 Merge remote-tracking branch 'origin/branch_2x' into branch_2x
d25ec9b6b is described below

commit d25ec9b6bfd0a7d702242e4a64a0215347a6d2b3
Author: Tim Allison <talli...@apache.org>
AuthorDate: Wed Oct 11 10:18:10 2023 -0400

    TIKA-4153 -- tighten RFC822 detection (#1396)
    
    * TIKA-4153 -- tighten rfc822 match to require a known header at offset 0
    
    (cherry picked from commit c2e6b01b070e7961dc37d043018f10d41223fd9f)
---
 .../org/apache/tika/mime/tika-mimetypes.xml        | 95 ++++++++++++----------
 .../org/apache/tika/mime/RFC822DetectionTest.java  | 79 ++++++++++++++++++
 2 files changed, 133 insertions(+), 41 deletions(-)

diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 366c687e7..b49e355e6 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -2135,12 +2135,16 @@
     <!-- should have a higher priority than rfc822 - TIKA-3489 -->
     <magic priority="55">
       <match minShouldMatch="2">
-        <match value="user-agent:" type="stringignorecase" offset="0"/>
-        <match value="sitemap:" type="stringignorecase" offset="0"/>
-        <match value="\nuser-agent:" type="stringignorecase" offset="0:1000"/>
-        <match value="\nallow:" type="stringignorecase" offset="0:1000"/>
-        <match value="\ndisallow:" type="stringignorecase" offset="0:1000"/>
-        <match value="\nsitemap:" type="stringignorecase" offset="0:1000"/>
+        <match minShouldMatch="1">
+          <match value="user-agent:" type="stringignorecase" offset="0"/>
+          <match value="sitemap:" type="stringignorecase" offset="0"/>
+        </match>
+        <match minShouldMatch="1">
+          <match value="\nuser-agent:" type="stringignorecase" offset="0:1000"/>
+          <match value="\nallow:" type="stringignorecase" offset="0:1000"/>
+          <match value="\ndisallow:" type="stringignorecase" offset="0:1000"/>
+          <match value="\nsitemap:" type="stringignorecase" offset="0:1000"/>
+        </match>
       </match>
     </magic>
     <sub-class-of type="text/plain"/>
@@ -6867,42 +6871,51 @@
       </match>
     </magic>
     <magic priority="45">
-      <!-- be a bit more flexible, but require two of these -->
+      <!-- be a bit more flexible, but require one from each of these -->
       <match minShouldMatch="2">
-        <match value="Date:" type="stringignorecase" offset="0"/>
-        <match value="Delivered-To:" type="string" offset="0"/>
-        <match value="From:" type="stringignorecase" offset="0"/>
-        <match value="Message-ID:" type="stringignorecase" offset="0"/>
-        <match value="MIME-Version:" type="stringignorecase" offset="0"/>
-        <match value="Received:" type="stringignorecase" offset="0"/>
-        <match value="Relay-Version:" type="stringignorecase" offset="0"/>
-        <match value="Return-Path:" type="stringignorecase" offset="0"/>
-        <match value="Sent:" type="string" offset="0"/>
-        <match value="Status:" type="string" offset="0"/>
-        <match value="User-Agent:" type="string" offset="0"/>
-        <match value="X-Mailer:" type="string" offset="0"/>
-        <match value="X-Originating-IP:" type="stringignorecase" offset="0"/>
-
-        <match value="\nContent-ID:" type="stringignorecase" offset="0:1024"/>
-        <match value="\nContent-Location:" type="stringignorecase" offset="0:1024"/>
-        <match value="\nContent-Transfer-Encoding:" type="stringignorecase" offset="0:1024"/>
-        <match value="\nContent-Type:" type="stringignorecase" offset="0:1024"/>
-        <match value="\nDate:" type="stringignorecase" offset="0:1024"/>
-        <match value="\nDelivered-To:" type="string" offset="0:1024"/>
-        <match value="\nFrom:" type="stringignorecase" offset="0:1024"/>
-        <match value="\nMIME-Version:" type="stringignorecase" offset="0:1024"/>
-        <match value="\nReceived:" type="stringignorecase" offset="0:1024"/>
-        <match value="\nRelay-Version:" type="stringignorecase" offset="0:1024"/>
-        <match value="\nReturn-Path:" type="stringignorecase" offset="0:1024"/>
-        <match value="\nSent:" type="string" offset="0:1024"/>
-        <match value="\nStatus:" type="string" offset="0:1024"/>
-        <match value="\nSubject:" type="string" offset="0:1024"/>
-        <match value="\nTo:" type="string" offset="0:1024"/>
-        <match value="\nUser-Agent:" type="string" offset="0:1024"/>
-        <match value="\nX-Mailer:" type="string" offset="0:1024"/>
-        <match value="\nX-Originating-IP:" type="stringignorecase" offset="0:1024"/>
-        <match value="\nDKIM-" type="string" offset="0:1024"/>
-        <match value="\nARC-" type="string" offset="0:1024"/>
+        <match minShouldMatch="1">
+          <match value="Content-ID:" type="stringignorecase" offset="0"/>
+          <match value="Content-Location:" type="stringignorecase" offset="0"/>
+          <match value="Content-Transfer-Encoding:" type="stringignorecase" offset="0"/>
+          <match value="Content-Type:" type="stringignorecase" offset="0"/>
+          <match value="Date:" type="stringignorecase" offset="0"/>
+          <match value="Delivered-To:" type="string" offset="0"/>
+          <match value="From:" type="stringignorecase" offset="0"/>
+          <match value="Message-ID:" type="stringignorecase" offset="0"/>
+          <match value="MIME-Version:" type="stringignorecase" offset="0"/>
+          <match value="Received:" type="stringignorecase" offset="0"/>
+          <match value="Relay-Version:" type="stringignorecase" offset="0"/>
+          <match value="Return-Path:" type="stringignorecase" offset="0"/>
+          <match value="Sent:" type="string" offset="0"/>
+          <match value="Status:" type="string" offset="0"/>
+          <match value="Subject:" type="string" offset="0"/>
+          <match value="To:" type="string" offset="0"/>
+          <match value="User-Agent:" type="string" offset="0"/>
+          <match value="X-Mailer:" type="string" offset="0"/>
+          <match value="X-Originating-IP:" type="stringignorecase" offset="0"/>
+        </match>
+        <match minShouldMatch="1">
+          <match value="\nContent-ID:" type="stringignorecase" offset="0:1024"/>
+          <match value="\nContent-Location:" type="stringignorecase" offset="0:1024"/>
+          <match value="\nContent-Transfer-Encoding:" type="stringignorecase" offset="0:1024"/>
+          <match value="\nContent-Type:" type="stringignorecase" offset="0:1024"/>
+          <match value="\nDate:" type="stringignorecase" offset="0:1024"/>
+          <match value="\nDelivered-To:" type="string" offset="0:1024"/>
+          <match value="\nFrom:" type="stringignorecase" offset="0:1024"/>
+          <match value="\nMIME-Version:" type="stringignorecase" offset="0:1024"/>
+          <match value="\nReceived:" type="stringignorecase" offset="0:1024"/>
+          <match value="\nRelay-Version:" type="stringignorecase" offset="0:1024"/>
+          <match value="\nReturn-Path:" type="stringignorecase" offset="0:1024"/>
+          <match value="\nSent:" type="string" offset="0:1024"/>
+          <match value="\nStatus:" type="string" offset="0:1024"/>
+          <match value="\nSubject:" type="string" offset="0:1024"/>
+          <match value="\nTo:" type="string" offset="0:1024"/>
+          <match value="\nUser-Agent:" type="string" offset="0:1024"/>
+          <match value="\nX-Mailer:" type="string" offset="0:1024"/>
+          <match value="\nX-Originating-IP:" type="stringignorecase" offset="0:1024"/>
+          <match value="\nDKIM-" type="string" offset="0:1024"/>
+          <match value="\nARC-" type="string" offset="0:1024"/>
+        </match>
       </match>
     </magic>
     <magic priority="40">
diff --git a/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java
new file mode 100644
index 000000000..7340e06a2
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+
+public class RFC822DetectionTest {
+
+    private static final MimeTypes MIME_TYPES = TikaConfig.getDefaultConfig().getMimeRepository();
+
+    @Test
+    public void testBasic() throws Exception {
+        for (String txt : new String[]{
+                "Date: blah\nSent: someone\r\nthis is a test",
+                "date: blah\nSent: someone\r\nthis is a test",
+                "date: blah\nDelivered-To: someone\r\nthis is a test"
+        }) {
+            assertMime("message/rfc822", txt);
+        }
+        for (String txt : new String[]{
+                //test missing colon
+                "Date blah\nSent: someone\r\nthis is a test",
+                //test precursor junk
+                "some precursor junk Date: blah\nSent: someone\r\nthis is a test",
+                "some precursor junk\nDate: blah\nSent: someone\r\nthis is a test",
+                "some precursor junk:\nDate: blah\nSent: someone\r\nthis is a test",
+                //confirm that date is case-insensitive, but delivered-to is case-sensitive
+                "date: blah\ndelivered-To: someone\r\nthis is a test",
+                //test that a file that starts only with "Subject:" and no other header is
+                //detected as text/plain
+                "Subject: this is a subject\nand there's some other text",
+                "To: someone\nand there's some other text",
+                "To: someone or other"
+        }) {
+            assertMime("text/plain", txt);
+        }
+
+        //TIKA-4153, specifically
+        String txt = "Some text here 1.\n" + "Some text here 2.\n" + "Some text here 3.\n" +
+                "Original Message-----\n" + "From: some_m...@abc.com\n" +
+                "Sent: Thursday, October 31, 2019 9:52 AM\n" +
+                "To: Some person, (The XYZ group)\n" +
+                "Subject: RE: Mr. Random person phone call: MESSAGE\n" + "Hi,\n" +
+                "I am available now to receive the call.\n" + "Some text here 4.\n" +
+                "Some text here 5.\n" + "Some text here 6.";
+        assertMime("text/plain", txt);
+    }
+
+    private void assertMime(String expected, String txt) throws IOException {
+
+        MediaType mediaType =
+                MIME_TYPES.detect(UnsynchronizedByteArrayInputStream.builder()
+                        .setByteArray(txt.getBytes(StandardCharsets.UTF_8)).get(), new Metadata());
+        assertEquals(expected, mediaType.toString(), txt);
+    }
+}

Reply via email to