This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_2x by this push:
new d25ec9b6b TIKA-4153 -- tighten RFC822 detection (#1396)
new 5a9ef38f0 Merge remote-tracking branch 'origin/branch_2x' into branch_2x
d25ec9b6b is described below
commit d25ec9b6bfd0a7d702242e4a64a0215347a6d2b3
Author: Tim Allison <[email protected]>
AuthorDate: Wed Oct 11 10:18:10 2023 -0400
TIKA-4153 -- tighten RFC822 detection (#1396)
* TIKA-4153 -- tighten rfc822 match to require a known header at offset 0
(cherry picked from commit c2e6b01b070e7961dc37d043018f10d41223fd9f)
---
.../org/apache/tika/mime/tika-mimetypes.xml | 95 ++++++++++++----------
.../org/apache/tika/mime/RFC822DetectionTest.java | 79 ++++++++++++++++++
2 files changed, 133 insertions(+), 41 deletions(-)
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 366c687e7..b49e355e6 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -2135,12 +2135,16 @@
<!-- should have a higher priority than rfc822 - TIKA-3489 -->
<magic priority="55">
<match minShouldMatch="2">
- <match value="user-agent:" type="stringignorecase" offset="0"/>
- <match value="sitemap:" type="stringignorecase" offset="0"/>
- <match value="\nuser-agent:" type="stringignorecase" offset="0:1000"/>
- <match value="\nallow:" type="stringignorecase" offset="0:1000"/>
- <match value="\ndisallow:" type="stringignorecase" offset="0:1000"/>
- <match value="\nsitemap:" type="stringignorecase" offset="0:1000"/>
+ <match minShouldMatch="1">
+ <match value="user-agent:" type="stringignorecase" offset="0"/>
+ <match value="sitemap:" type="stringignorecase" offset="0"/>
+ </match>
+ <match minShouldMatch="1">
+ <match value="\nuser-agent:" type="stringignorecase" offset="0:1000"/>
+ <match value="\nallow:" type="stringignorecase" offset="0:1000"/>
+ <match value="\ndisallow:" type="stringignorecase" offset="0:1000"/>
+ <match value="\nsitemap:" type="stringignorecase" offset="0:1000"/>
+ </match>
</match>
</magic>
<sub-class-of type="text/plain"/>
@@ -6867,42 +6871,51 @@
</match>
</magic>
<magic priority="45">
- <!-- be a bit more flexible, but require two of these -->
+ <!-- be a bit more flexible, but require one from each of these -->
<match minShouldMatch="2">
- <match value="Date:" type="stringignorecase" offset="0"/>
- <match value="Delivered-To:" type="string" offset="0"/>
- <match value="From:" type="stringignorecase" offset="0"/>
- <match value="Message-ID:" type="stringignorecase" offset="0"/>
- <match value="MIME-Version:" type="stringignorecase" offset="0"/>
- <match value="Received:" type="stringignorecase" offset="0"/>
- <match value="Relay-Version:" type="stringignorecase" offset="0"/>
- <match value="Return-Path:" type="stringignorecase" offset="0"/>
- <match value="Sent:" type="string" offset="0"/>
- <match value="Status:" type="string" offset="0"/>
- <match value="User-Agent:" type="string" offset="0"/>
- <match value="X-Mailer:" type="string" offset="0"/>
- <match value="X-Originating-IP:" type="stringignorecase" offset="0"/>
-
- <match value="\nContent-ID:" type="stringignorecase" offset="0:1024"/>
- <match value="\nContent-Location:" type="stringignorecase" offset="0:1024"/>
- <match value="\nContent-Transfer-Encoding:" type="stringignorecase" offset="0:1024"/>
- <match value="\nContent-Type:" type="stringignorecase" offset="0:1024"/>
- <match value="\nDate:" type="stringignorecase" offset="0:1024"/>
- <match value="\nDelivered-To:" type="string" offset="0:1024"/>
- <match value="\nFrom:" type="stringignorecase" offset="0:1024"/>
- <match value="\nMIME-Version:" type="stringignorecase" offset="0:1024"/>
- <match value="\nReceived:" type="stringignorecase" offset="0:1024"/>
- <match value="\nRelay-Version:" type="stringignorecase" offset="0:1024"/>
- <match value="\nReturn-Path:" type="stringignorecase" offset="0:1024"/>
- <match value="\nSent:" type="string" offset="0:1024"/>
- <match value="\nStatus:" type="string" offset="0:1024"/>
- <match value="\nSubject:" type="string" offset="0:1024"/>
- <match value="\nTo:" type="string" offset="0:1024"/>
- <match value="\nUser-Agent:" type="string" offset="0:1024"/>
- <match value="\nX-Mailer:" type="string" offset="0:1024"/>
- <match value="\nX-Originating-IP:" type="stringignorecase" offset="0:1024"/>
- <match value="\nDKIM-" type="string" offset="0:1024"/>
- <match value="\nARC-" type="string" offset="0:1024"/>
+ <match minShouldMatch="1">
+ <match value="Content-ID:" type="stringignorecase" offset="0"/>
+ <match value="Content-Location:" type="stringignorecase" offset="0"/>
+ <match value="Content-Transfer-Encoding:" type="stringignorecase" offset="0"/>
+ <match value="Content-Type:" type="stringignorecase" offset="0"/>
+ <match value="Date:" type="stringignorecase" offset="0"/>
+ <match value="Delivered-To:" type="string" offset="0"/>
+ <match value="From:" type="stringignorecase" offset="0"/>
+ <match value="Message-ID:" type="stringignorecase" offset="0"/>
+ <match value="MIME-Version:" type="stringignorecase" offset="0"/>
+ <match value="Received:" type="stringignorecase" offset="0"/>
+ <match value="Relay-Version:" type="stringignorecase" offset="0"/>
+ <match value="Return-Path:" type="stringignorecase" offset="0"/>
+ <match value="Sent:" type="string" offset="0"/>
+ <match value="Status:" type="string" offset="0"/>
+ <match value="Subject:" type="string" offset="0"/>
+ <match value="To:" type="string" offset="0"/>
+ <match value="User-Agent:" type="string" offset="0"/>
+ <match value="X-Mailer:" type="string" offset="0"/>
+ <match value="X-Originating-IP:" type="stringignorecase" offset="0"/>
+ </match>
+ <match minShouldMatch="1">
+ <match value="\nContent-ID:" type="stringignorecase" offset="0:1024"/>
+ <match value="\nContent-Location:" type="stringignorecase" offset="0:1024"/>
+ <match value="\nContent-Transfer-Encoding:" type="stringignorecase" offset="0:1024"/>
+ <match value="\nContent-Type:" type="stringignorecase" offset="0:1024"/>
+ <match value="\nDate:" type="stringignorecase" offset="0:1024"/>
+ <match value="\nDelivered-To:" type="string" offset="0:1024"/>
+ <match value="\nFrom:" type="stringignorecase" offset="0:1024"/>
+ <match value="\nMIME-Version:" type="stringignorecase" offset="0:1024"/>
+ <match value="\nReceived:" type="stringignorecase" offset="0:1024"/>
+ <match value="\nRelay-Version:" type="stringignorecase" offset="0:1024"/>
+ <match value="\nReturn-Path:" type="stringignorecase" offset="0:1024"/>
+ <match value="\nSent:" type="string" offset="0:1024"/>
+ <match value="\nStatus:" type="string" offset="0:1024"/>
+ <match value="\nSubject:" type="string" offset="0:1024"/>
+ <match value="\nTo:" type="string" offset="0:1024"/>
+ <match value="\nUser-Agent:" type="string" offset="0:1024"/>
+ <match value="\nX-Mailer:" type="string" offset="0:1024"/>
+ <match value="\nX-Originating-IP:" type="stringignorecase" offset="0:1024"/>
+ <match value="\nDKIM-" type="string" offset="0:1024"/>
+ <match value="\nARC-" type="string" offset="0:1024"/>
+ </match>
</match>
</magic>
<magic priority="40">
diff --git a/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java
new file mode 100644
index 000000000..7340e06a2
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+
+public class RFC822DetectionTest {
+
+ private static final MimeTypes MIME_TYPES = TikaConfig.getDefaultConfig().getMimeRepository();
+
+ @Test
+ public void testBasic() throws Exception {
+ for (String txt : new String[]{
+ "Date: blah\nSent: someone\r\nthis is a test",
+ "date: blah\nSent: someone\r\nthis is a test",
+ "date: blah\nDelivered-To: someone\r\nthis is a test"
+ }) {
+ assertMime("message/rfc822", txt);
+ }
+ for (String txt : new String[]{
+ //test missing colon
+ "Date blah\nSent: someone\r\nthis is a test",
+ //test precursor junk
+ "some precursor junk Date: blah\nSent: someone\r\nthis is a test",
+ "some precursor junk\nDate: blah\nSent: someone\r\nthis is a test",
+ "some precursor junk:\nDate: blah\nSent: someone\r\nthis is a test",
+ //confirm that date is case-insensitive, but delivered-to is case-sensitive
+ "date: blah\ndelivered-To: someone\r\nthis is a test",
+ //test that a file that starts only with "Subject:" and no other header is
+ //detected as text/plain
+ "Subject: this is a subject\nand there's some other text",
+ "To: someone\nand there's some other text",
+ "To: someone or other"
+ }) {
+ assertMime("text/plain", txt);
+ }
+
+ //TIKA-4153, specifically
+ String txt = "Some text here 1.\n" + "Some text here 2.\n" + "Some text here 3.\n" +
+ "Original Message-----\n" + "From: [email protected]\n" +
+ "Sent: Thursday, October 31, 2019 9:52 AM\n" +
+ "To: Some person, (The XYZ group)\n" +
+ "Subject: RE: Mr. Random person phone call: MESSAGE\n" + "Hi,\n" +
+ "I am available now to receive the call.\n" + "Some text here 4.\n" +
+ "Some text here 5.\n" + "Some text here 6.";
+ assertMime("text/plain", txt);
+ }
+
+ private void assertMime(String expected, String txt) throws IOException {
+
+ MediaType mediaType =
+ MIME_TYPES.detect(UnsynchronizedByteArrayInputStream.builder()
+ .setByteArray(txt.getBytes(StandardCharsets.UTF_8)).get(), new Metadata());
+ assertEquals(expected, mediaType.toString(), txt);
+ }
+}